Integrative analysis reveals therapeutic potential of pyrviniym pamoate in Merkel cell carcinoma


back to home

IMR90 WGCNA analysis

library(limma)
library(edgeR)
library(ggplot2)
library(RColorBrewer)
library(stringr)
library(ggrepel)
library(plotly)
library(processx)
library(biomaRt)
library(WGCNA)
library(clusterProfiler)
library(org.Hs.eg.db)
library(ggpubr)
library(dplyr)
library(igraph)
library(DT)
library(networkD3)
library(netZooR)
library(ComplexHeatmap)
library(corrplot)
library(directlabels)

Importing data to the environment

#IMR90 normalized expression matrix =========
df<-readRDS("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_wgcna/IMR90_edat.RDS")
IMR90_ER_edat<-df[,grep("ER", colnames(df))]
IMR90_GFP_edat<-df[,grep("GFP", colnames(df))]
IMR90_ER_GFP_edat<-cbind(IMR90_GFP_edat, IMR90_ER_edat)

IMR90 data processing

#ER_samples =======================
df_er<-df[, grep("ER", colnames(df))]

df_er.pca = prcomp(t(df_er), center = TRUE, scale = TRUE)
group_er<-factor(c(rep("ER_0", 3), rep("ER_4", 3), rep("ER_8", 3), 
                   rep("ER_12", 3), rep("ER_16", 3), rep("ER_20", 3), 
                   rep("ER_24", 3), rep("ER_32", 3), rep("ER_40", 2), rep("ER_48",3)),
                   levels = c("ER_0", "ER_4", "ER_8", "ER_12", "ER_16", "ER_20",
                              "ER_24", "ER_32", "ER_40", "ER_48"))

df_er.plot=data.frame(df_er.pca$x[,1],df_er.pca$x[,2],group_er, rownames(df_er.pca$x))
colnames(df_er.plot)=c("PC1","PC2","group_er","cond")
rownames(df_er.plot)=rownames(df_er.pca$x)
percentage_er <- round(df_er.pca$sdev / sum(df_er.pca$sdev) * 100, 2)
percentage_er_lab <- paste(colnames(df_er.plot), "(", paste( as.character(percentage_er), "%", ")", sep=""))

#GFP_samples =========================
df_gfp<-df[, grep("GFP", colnames(df))]

df_gfp.pca = prcomp(t(df_gfp), center = TRUE, scale = TRUE)
group_gfp<-factor(c(rep("GFP_0", 3), rep("GFP_4", 2), rep("GFP_8", 3), 
                   rep("GFP_12", 1), rep("GFP_16", 3), rep("GFP_20", 3), 
                   rep("GFP_24", 3), rep("GFP_32", 3), rep("GFP_40", 3), rep("GFP_48",3)),
                   levels = c("GFP_0", "GFP_4", "GFP_8", "GFP_12", "GFP_16", "GFP_20",
                              "GFP_24", "GFP_32", "GFP_40", "GFP_48"))

df_gfp.plot=data.frame(df_gfp.pca$x[,1],df_gfp.pca$x[,2],group_gfp, rownames(df_gfp.pca$x))
colnames(df_gfp.plot)=c("PC1","PC2","group_gfp","cond")
rownames(df_gfp.plot)=rownames(df_gfp.pca$x)
percentage_gfp <- round(df_gfp.pca$sdev / sum(df_gfp.pca$sdev) * 100, 2)
percentage_gfp_lab <- paste(colnames(df_gfp.plot), "(", paste( as.character(percentage_gfp), "%", ")", sep=""))

Plotting 3D PCA plot of IMR90 ER and GFP samples

tot_explained_variance_ratio_er <- summary(df_er.pca)[["importance"]]['Proportion of Variance',]
tot_explained_variance_ratio_er<- 100 * sum(tot_explained_variance_ratio_er)

tit_er<-paste0("Total Explained Variance = ", tot_explained_variance_ratio_er, "\n PCA of normalized IMR90 ER samples")

components_er<-data.frame(df_er.pca$x[,1],df_er.pca$x[,2],df_er.pca$x[,3], group_er, rownames(df_er.pca$x))
colnames(components_er)<-c("PC1","PC2", "PC3", "group_er", "sample_names")

axx <- list(
  title = paste0("PC1 (", percentage_er[1], ")" ))
axy <- list(
  title = paste0("PC2 (", percentage_er[2], ")" ))

axz <- list(
  title = paste0("PC3 (", percentage_er[3], ")" ))

fig_er <- plot_ly(components_er, x = ~PC1, y = ~PC2, z = ~PC3, color = ~group_er, colors = c('#636EFA','#EF553B','#00CC96')) %>%
  add_markers(size = 28)


fig_er <- fig_er %>%
  layout(
    title = tit_er,
    scene = list(bgcolor = "white", 
                 xaxis=axx, 
                 yaxis=axy, 
                 zaxis=axz)
    )

fig_er
tot_explained_variance_ratio_gfp <- summary(df_gfp.pca)[["importance"]]['Proportion of Variance',]
tot_explained_variance_ratio_gfp<- 100 * sum(tot_explained_variance_ratio_gfp)

tit_gfp<-paste0("Total Explained Variance = ", tot_explained_variance_ratio_gfp, "\n PCA of normalized IMR90 GFP samples")

components_gfp<-data.frame(df_gfp.pca$x[,1],df_gfp.pca$x[,2],df_gfp.pca$x[,3], group_gfp, rownames(df_gfp.pca$x))
colnames(components_gfp)<-c("PC1","PC2", "PC3", "group_gfp", "sample_names")

axx <- list(
  title = paste0("PC1 (", percentage_gfp[1], ")" ))

axy <- list(
  title = paste0("PC2 (", percentage_gfp[2], ")" ))

axz <- list(
  title = paste0("PC3 (", percentage_gfp[3], ")" ))


fig_gfp <- plot_ly(components_gfp, x = ~PC1, y = ~PC2, z = ~PC3, color = ~group_gfp, colors = c('#636EFA','#EF553B','#00CC96')) %>%
  add_markers(size = 28)


fig_gfp <- fig_gfp %>%
  layout(
    title = tit_gfp,
    scene = list(bgcolor = "white",
                 xaxis=axx, 
                 yaxis=axy, 
                 zaxis=axz))

fig_gfp

Diffierentially expressed genes analysis of IMR90 ER and GFP samples at 48h

group_48h_DEGs<-factor(c(rep("GFP_48",3), rep("ER_48",3)), levels = c("GFP_48", "ER_48"))
design<-model.matrix(~0 + group_48h_DEGs)
IMR90_48h_ER_GFP<-df[,c(grep("GFP_48",colnames(df)), grep("ER_48", colnames(df)))]
comparison = "group_48h_DEGsER_48 - group_48h_DEGsGFP_48"
cont=makeContrasts(comparison,levels=design)
fit=lmFit(IMR90_48h_ER_GFP,design)
vfit=contrasts.fit(fit,contrasts=cont)
efit<- eBayes(vfit)
DEGs_48h<-topTreat(efit, sort.by = "P", n = Inf)
DEGs_48h_sig_gene_list<-DEGs_48h[abs(DEGs_48h$logFC)>=0 & DEGs_48h$adj.P.Val <= 0.05,]
IMR90_ER_sig_DEGs<-df[rownames(DEGs_48h_sig_gene_list), grep("ER", colnames(df))]

Gene co-expression analysis by WGCNA with IMR90 ER samples

#Clustering by WGCNA (IMR90 48h ER vs.GFP DEGs with abs lfc > = 0 & padj <= 0.05,  total 10513 genes) 

nettype = "signed"
powers = c(c(1:10), seq(from = 12, to=20, by=2))
IMR90_ER_expression_data<-IMR90_ER_sig_DEGs

sft_ER=pickSoftThreshold(t(IMR90_ER_expression_data), dataIsExpr = TRUE, powerVector = powers, networkType = nettype, verbose = 5)

# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft_ER$fitIndices[,1], -sign(sft_ER$fitIndices[,3])*sft_ER$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab= paste0("Scale Free Topology Model Fit ", nettype, " R^2"), type="n",
     main = paste("Scale independence"))+text(sft_ER$fitIndices[,1], -sign(sft_ER$fitIndices[,3])*sft_ER$fitIndices[,2],
     labels=powers,cex=1,col="red")+abline(h=0.90,col="red")

# Mean connectivity as a function of the soft-thresholding power
plot(sft_ER$fitIndices[,1], sft_ER$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))+text(sft_ER$fitIndices[,1], sft_ER$fitIndices[,5], labels=powers, cex=1, col="red")


##Create co-expression matrix=========================
cor <- WGCNA::cor
net_ER = blockwiseModules(
  t(IMR90_ER_expression_data),
  power = 16 , #https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/faq.html Q#6 or based on powerEstimate
  TOMType = "signed", 
  minModuleSize = 30,   # We like large modules, so we set the minimum module size relatively high
  reassignThreshold = 0, 
  mergeCutHeight = 0.25,
  numericLabels = TRUE, 
  pamRespectsDendro = FALSE,
  saveTOMs = F,
  verbose = 3
)
table(net_ER$colors)
cor<-stats::cor

Gene modules analysis on IMR90 ER samples

IMR90_ER_expression_data<-IMR90_ER_sig_DEGs
sizeGrWindow(12,10)
moduleLabels = net_ER$colors
mergedColors = labels2colors(net_ER$colors)
plotDendroAndColors(net_ER$dendrograms[[1]], mergedColors[net_ER$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
reference_df<-data.frame(gene = names(moduleLabels), number = unname(moduleLabels), color = mergedColors)

MEs = net_ER$MEs;
geneTree = net_ER$dendrograms[[1]];

# Module identification using dynamic tree cut:
minModuleSize = 30
dynamicMods = cutreeDynamic(dendro = geneTree,
                            deepSplit = 2, 
                            pamRespectsDendro = FALSE,
                            minClusterSize = minModuleSize);

# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)

# Plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")

# Module-trait(timepoint) relationship for ER samples
design_ER<-colnames(IMR90_ER_expression_data)
design_ER<-unlist(lapply(design_ER, function(x) substring(x, 1, nchar(x)-2)))
design_ER<-gsub("_48", "_t5",design_ER)
design_ER<-gsub("_40", "_t5",design_ER)
design_ER<-gsub("_32", "_t4",design_ER)
design_ER<-gsub("_24", "_t4",design_ER)
design_ER<-gsub("_20", "_t3",design_ER)
design_ER<-gsub("_16", "_t3",design_ER)
design_ER<-gsub("_12", "_t2",design_ER)
design_ER<-gsub("_8", "_t2",design_ER)
design_ER<-gsub("_4", "_t1",design_ER)
design_ER<-gsub("_0", "_t1",design_ER)
design_er = model.matrix(~0 + design_ER)
colnames(design_er) = levels(as.factor(design_ER))

moduleTraitCor = cor(MEs, design_er, method = "kendall")
nSamples = ncol(t(IMR90_ER_expression_data))
moduleTraitPvalue_fisher = corPvalueFisher(moduleTraitCor, nSamples) #Fisher's asymptotic p-value for correlation

sizeGrWindow(10,6)
# Display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                   signif(moduleTraitPvalue_fisher, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(5, 5, 5, 5));

# Display the correlation values within a heatmap
colnames(MEs)<-substr(colnames(MEs), 3, nchar(colnames(MEs)))
colnames(MEs)<-paste0("Module ", colnames(MEs))
moduleTraitCor<-moduleTraitCor[,factor(c("ER_t1", "ER_t2", "ER_t3", "ER_t4", "ER_t5"), levels = c("ER_t1", "ER_t2", "ER_t3", "ER_t4", "ER_t5"))]
rownames(moduleTraitCor)<-paste0("Module_", unlist(lapply(rownames(moduleTraitCor), function(x) substr(x, 3, nchar(x)))))

labeledHeatmap(Matrix = moduleTraitCor,
               #xLabels = c("ER_0", "ER_early", "ER_middle", "ER_late"),
               xLabels = c("ER_t1", "ER_t2", "ER_t3", "ER_t4", "ER_t5"),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = blueWhiteRed(100),
               textMatrix = textMatrix,
               setStdMargins = T,
               cex.text = 0.4,
               zlim = c(-1,1),
               main = paste("IMR90_ER: Module-time period relationships"))
plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle = 30)

Eigengene analysis on IMR90 ER samples

# 1. Performing Eigengene analysis on gene modules across time in IMR90 ER samples
ER_time_point<-c("ER_0", "ER_4", "ER_8", 
                 "ER_12", "ER_16", "ER_20", 
                 "ER_24", "ER_32", "ER_40", 
                 "ER_48")

MEs_mean<-data.frame(modules = rownames(t(MEs)))
for (i in 1:length(ER_time_point)){
  time_point<-ER_time_point[i]
  sub_df<-as.data.frame(t(MEs)[, grep(time_point, colnames(t(MEs)))])
  sub_df[,"mean"]<-apply(sub_df, 1, mean)
  df_mean<-as.data.frame(sub_df[, "mean"])
  colnames(df_mean)<-time_point
  MEs_mean<-cbind(MEs_mean, df_mean)
}

rownames(MEs_mean)<-MEs_mean$modules
MEs_mean<-MEs_mean[,-1]
MEs_mean_df<-reshape2::melt(as.matrix(MEs_mean))

colnames(MEs_mean_df)<-c("Module", "time", "Eigengene")
MEs_mean_df$time<-unlist(lapply(as.character(MEs_mean_df$time), function(x) strsplit(x, "_")[[1]][2]))
MEs_mean_df$time<-as.numeric(MEs_mean_df$time)
MEs_mean_df$Eigengene<-as.numeric(MEs_mean_df$Eigengene)

theme<-theme(panel.background = element_blank(),
             panel.border=element_rect(fill=NA),
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             strip.background=element_blank(),
             axis.text.x=element_text(colour="black"),
             axis.text.y=element_text(colour="black"),
             axis.ticks=element_line(colour="black"),
             plot.margin=unit(c(1,1,1,1),"line"),
             axis.title.x = element_text(color="black", size=20, face="bold"),
             axis.title.y = element_text(color="black", size=20, face="bold"),
             axis.text = element_text(size = 20, face = "bold"))

for (i in 1:length(rownames(MEs_mean))){
  module<-rownames(MEs_mean)[i]
  df_module<-MEs_mean_df[MEs_mean_df$Module == module,]
  p<-ggplot(df_module, 
       aes(x=factor(time, levels = c(0,4,8,12,16,20,24,32,40,48)),
           y=Eigengene, colour=Module, group=Module, fill=Module)) +
  geom_line(size =0.1)+
  geom_point(size=0.5)+
  ylim(-0.4, 0.4)+
  labs(x = 'time (h)', y = "Eigengene",title = 'Eigengenes of the each WGCNA modules')
}

# 2. Putting the dynamic Eigengene curve of each module into groups based on the distance of modules

  p1<-ggplot(MEs_mean_df[MEs_mean_df$Module %in% c("Module 14", "Module 1", "Module 11"), ],
      mapping = aes(x=factor(time, levels = c(0,4,8,12,16,20,24,32,40,48)), y=Eigengene, group = Module, fill = Module)) +
  geom_line(aes(color=Module), size = 0.8)+
  geom_point(size=2, shape = 21)+
  scale_fill_brewer(palette = "Blues")+
  scale_color_brewer(palette = "Blues")+
  ylim(-0.4, 0.4)+
  labs(x = 'time (h)', y = "Eigengene")

  p1<-p1+theme

  p2<-ggplot(MEs_mean_df[MEs_mean_df$Module %in% c("Module 13", "Module 5", "Module 10", "Module 12"), ],
      mapping = aes(x=factor(time, levels = c(0,4,8,12,16,20,24,32,40,48)), y=Eigengene, group = Module, fill = Module)) +
  geom_line(aes(color=Module), size = 0.8)+
  geom_point(size=2, shape = 21)+
  scale_fill_brewer(palette = "Greens")+
  scale_color_brewer(palette = "Greens")+
  ylim(-0.4, 0.4)+
  labs(x = 'time (h)', y = "Eigengene")

  p2<-p2+theme
  
  p3<-ggplot(MEs_mean_df[MEs_mean_df$Module %in% c("Module 4", "Module 3", "Module 8"), ],
      mapping = aes(x=factor(time, levels = c(0,4,8,12,16,20,24,32,40,48)), y=Eigengene, group = Module, fill = Module)) +
  geom_line(aes(color=Module), size = 0.8)+
  geom_point(size=2, shape = 21)+
  scale_fill_brewer(palette = "Oranges")+
  scale_color_brewer(palette = "Oranges")+
  ylim(-0.4, 0.4)+
  labs(x = 'time (h)', y = "Eigengene")

  p3<-p3+theme
  
  p4<-ggplot(MEs_mean_df[MEs_mean_df$Module %in% c("Module 6", "Module 9", "Module 7", "Module 2"), ],
      mapping = aes(x=factor(time, levels = c(0,4,8,12,16,20,24,32,40,48)), y=Eigengene, group = Module, fill = Module)) +
  geom_line(aes(color=Module), size = 0.8)+
  geom_point(size=2, shape = 21)+
  scale_fill_brewer(palette = "Purples")+
  scale_color_brewer(palette = "Purples")+
  ylim(-0.4, 0.4)+
  labs(x = 'time (h)', y = "Eigengene")

  p4<-p4+theme

  WGCNA_Eigengenes_plot<-ggarrange(p1, p2, p3, p4, 
              ncol = 2,
              nrow = 2
) 
 WGCNA_Eigengenes_plot

GO-terms analysis on gene modules from IMR90 ER samples

probes = colnames(t(IMR90_ER_expression_data))
g_list<-getBM(filters = "hgnc_symbol", 
                       attributes= c("ensembl_gene_id","hgnc_symbol", "entrezgene_id"),values=probes,mart= genemart)
gene_list<-list()
for (i in 1:length(unique(moduleLabels))){
  number<-unique(moduleLabels)[i]
  module_name<-paste0("Module_", number)
  modnumber<-probes[moduleLabels == number]
  df_genes<-g_list[g_list$hgnc_symbol %in% modnumber,"ensembl_gene_id"]
  gene_list[[module_name]]<-df_genes
}

gene_df<-list()
module_name_list<-paste0("Module_", c(1:14))
for (i in 1:14){
  module_name<-module_name_list[i] #remove Module_0, since it's the module with genes that do not co-express with others
  genes<-gene_list[[module_name]]
  df_genes<-g_list[g_list$ensembl_gene_id %in% genes,]
  df_genes<-df_genes[!duplicated(df_genes$hgnc_symbol),]
  gene_df[[module_name]]<-df_genes
}
##GO-Term enrichment based on WGCNA network================================
WGCNA_modules_go_result<-list()
for (i in 1:length(gene_df)){
  module_name<-names(gene_df)[i]
  genes<-gene_df[[module_name]]
  genes<-genes$hgnc_symbol
  em_ER_BP<-enrichGO(genes, 
                     OrgDb = org.Hs.eg.db, 
                     ont = "BP",
                   pAdjustMethod = "BH", 
                     pvalueCutoff = 0.05,
                     qvalueCutoff = 0.25, 
                     keyType = 'SYMBOL')
 WGCNA_modules_go_result[[module_name]]<-em_ER_BP@result
}

Significant Go terms visualization for WGCNA modules

WGCNA_modules_go_result<-readRDS("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_wgcna/IMR90_WGCNA_modules_go_result.RDS")

color_pal<-c(brewer.pal(n = 3, "Blues"), brewer.pal(n = 4, "Greens"), brewer.pal(n = 3, "Oranges"), brewer.pal(n = 4, "Purples"))
sig_go_results_list_order<-paste0("Module_", c(14,1,11,13,5,10,12,4,3,8,6,9,7,2))

color_pal_new<-color_pal[c(1:9,11,13:14)]


sig_go_results_df<-data.frame()
for (i in 1:14){
  module<-as.character(sig_go_results_list_order[i])
  go_result<-WGCNA_modules_go_result[[module]]
  go_result[,"module"]<-module
  go_result<-go_result[go_result$p.adjust <= 0.05, ]
  go_result<-go_result[order(as.numeric(go_result$p.adjust), decreasing = F), ][1:20,]
  sig_go_results_df<-rbind(sig_go_results_df, go_result)
}
sig_go_results_df<-na.omit(sig_go_results_df)
sig_go_results_df<-sig_go_results_df[!duplicated(sig_go_results_df$Description),]

p<-ggplot(data=sig_go_results_df, aes(x=factor(Description, levels = sig_go_results_df$Description), y= -log10(p.adjust)), fill = module)+
    geom_col(aes(fill = module), colour = "black", width=0.85, position=position_dodge(0.5)) +
    theme(axis.text.x = element_text(size = 10, angle=90, vjust=.5, hjust=1, face = "bold"),
           panel.background = element_blank(),
           legend.position= c(0.13, 0.7),
           legend.direction = "horizontal",
           legend.title = element_text(colour="black", size=20, 
                                    face="bold"),
           legend.text = element_text(colour="black", size=15, 
                                     face="bold"),
           legend.background = element_rect(fill="grey90",
                                  linewidth =0.5, linetype="solid"),
           axis.text.y = element_text(size = 15, face = "bold"),
           plot.title = element_text(size = 20, face = "bold"),
           axis.title = element_text(size = 20, face = "bold"),
           axis.line = element_line(colour = "black"))+
     ylab("-log10(p.adjust)") + 
     xlab("GO Terms")+
     scale_y_continuous(breaks = seq(0, -log10(min(sig_go_results_df$p.adjust))*1.0, by = 5))+
     scale_fill_manual(values=color_pal_new, limits = c(unique(sig_go_results_df$module)))+
     scale_x_discrete(labels = function(x) str_wrap(x, width = 65))
p

sig_go_result_df_1<-sig_go_results_df[sig_go_results_df$module %in% c("Module_14", "Module_1", "Module_11", "Module_13", "Module_5"), ]

p_blue_green<-ggplot(data=sig_go_result_df_1, aes(x=factor(Description, levels = sig_go_result_df_1$Description), y= -log10(p.adjust)), fill = module)+
    geom_col(aes(fill = module), colour = "black", width=0.85, position=position_dodge(0.5)) +
    theme(axis.text.x = element_text(size = 25, angle=90, vjust=.5, hjust=1, face = "bold"),
           panel.background = element_blank(),
           legend.position= c(0.13, 0.7),
           legend.direction = "horizontal",
           legend.title = element_text(colour="black", size=20, 
                                    face="bold"),
           legend.text = element_text(colour="black", size=15, 
                                     face="bold"),
           legend.background = element_rect(fill="grey90",
                                  linewidth =0.5, linetype="solid"),
           axis.text.y = element_text(size = 20, face = "bold"),
           plot.title = element_text(size = 20, face = "bold"),
           axis.title = element_text(size = 20, face = "bold"),
           axis.line = element_line(colour = "black"))+
     ylab("-log10(p.adjust)") + 
     xlab("GO Terms")+
     scale_y_continuous(breaks = seq(0, -log10(min(sig_go_result_df_1$p.adjust))*1.0, by = 5))+
     scale_fill_manual(values=color_pal_new, limits = c(unique(sig_go_result_df_1$module)))+
     scale_x_discrete(labels = function(x) str_wrap(x, width = 65))
p_blue_green

Network centrality analysis and network visualization of gene modules from WGCNA

TOM = readRDS("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_wgcna/WGCNA_TOM.rds")
probes = colnames(t(IMR90_ER_expression_data))
dimnames(TOM)<-list(probes, probes)

# 1.Selecting top hub genes from each gene module
network_hub_list<-list()
nTOM = 40 #top nodes from the WGCNA network
for (i in 1:length(gene_df)){
  module<-names(gene_df)[i]
  genes<-gene_df[[module]][,"hgnc_symbol"]
  m<-TOM[genes, genes]
  hubs<-names(sort(rowSums(m), decreasing = T)[1:nTOM])
  network_hub_list[[module]]<-hubs
}

network_hub_list # The top 40 hubs in each module
## $Module_1
##  [1] "NIBAN2"  "CAPN1"   "INF2"    "MGAT4B"  "VPS28"   "FLOT2"   "FKBP8"  
##  [8] "FAM3A"   "GNB2"    "CDIPT"   "G6PD"    "RNPEPL1" "METRN"   "EHD2"   
## [15] "APRT"    "TSPAN4"  "FBXW5"   "PACS2"   "CAPG"    "TUBGCP2" "MAPK3"  
## [22] "BLVRB"   "SLC27A1" "VAT1"    "GNAI2"   "IGFBP6"  "DBNL"    "PIP5K1C"
## [29] "SPON2"   "MVP"     "IFITM3"  "ERCC2"   "PGLS"    "LY6E"    "NISCH"  
## [36] "TGFB1I1" "VEGFB"   "GSN"     "NPDC1"   "TSC2"   
## 
## $Module_2
##  [1] "FANCD2"   "MAD2L1"   "SMARCC1"  "CDK1"     "UCHL5"    "RRM2"    
##  [7] "SKA3"     "SMC4"     "CIP2A"    "NDC1"     "RRM2P3"   "CENPU"   
## [13] "NCAPG2"   "CENPN"    "ZNF714"   "HMGB2"    "PAICS"    "CBX1"    
## [19] "MELK"     "DDIAS"    "NDC80"    "SMC2"     "FANCI"    "CSE1L"   
## [25] "RAD51AP1" "NUP107"   "RFC5"     "SPC25"    "CCNB2"    "UBE2T"   
## [31] "CEP55"    "PBK"      "RFC3"     "SINHCAF"  "POLR3G"   "PAICSP4" 
## [37] "MTHFD2"   "DHX9"     "PHACTR4"  "MTF2"    
## 
## $Module_3
##  [1] "NUP153"  "NUP155"  "PPAT"    "BRCA2"   "HS2ST1"  "NAA50"   "ATAD5"  
##  [8] "MMS22L"  "SMCHD1"  "NUP50"   "XPOT"    "SLC16A1" "UTP20"   "SEH1L"  
## [15] "QSER1"   "ABCE1"   "NOC3L"   "ELOVL5"  "SASS6"   "NPAT"    "ZNF850" 
## [22] "IPO7"    "PDS5B"   "NUP58"   "DNA2"    "ATAD2"   "CHD1"    "AHCTF1" 
## [29] "HAUS6P1" "USP1"    "SCML2"   "NUDT21"  "KLHL23"  "PLOD2"   "ZNF678" 
## [36] "LRRC8B"  "ENC1"    "FMR1"    "MSH2"    "XPO4"   
## 
## $Module_4
##  [1] "OSBPL8"   "RLIM"     "RO60"     "CAMSAP2"  "ZFYVE16"  "DNAJB14" 
##  [7] "LTN1"     "ITCH"     "CACNA2D1" "NHLRC2"   "UBR3"     "ZBTB41"  
## [13] "TAF2"     "ZNF451"   "ERBIN"    "SEL1L"    "ATP11B"   "SOCS4"   
## [19] "NEDD4"    "EXOC5"    "ZNF800"   "TMF1"     "ARHGAP5"  "SLC16A7" 
## [25] "ITGAV"    "ACSL4"    "SMAD5"    "BMPR1A"   "MBNL1"    "EDEM1"   
## [31] "ZNF148"   "ATP7A"    "MOB1B"    "MDFIC"    "MAN2A1"   "SP3"     
## [37] "REST"     "TNKS2"    "MAN1A2"   "MARCHF6" 
## 
## $Module_5
##  [1] "THSD4"   "LMF1"    "IGFBP5"  "APCDD1L" "KCNMA1"  "H6PD"    "PHLDB1" 
##  [8] "CPA4"    "EPHB2"   "SIDT2"   "TNS1"    "SEMA7A"  "CD9"     "NPTX1"  
## [15] "MYPN"    "MYADM"   "GALNT10" "CCND1"   "SLC17A5" "GM2A"    "CDCP1"  
## [22] "BACE1"   "PLBD2"   "MGAT5"   "DHRS7"   "CXCL12"  "TGM2"    "SLC16A4"
## [29] "KREMEN1" "FBLN5"   "FAM120B" "LTBP2"   "MBTPS1"  "FOLR3"   "GALNT11"
## [36] "PTPRA"   "MEAK7"   "PSG4"    "LBH"     "WNT5A"  
## 
## $Module_6
##  [1] "RPS3AP6"  "RPS3AP26" "RPS3AP5"  "RPS3A"    "RPL9"     "RPS3AP21"
##  [7] "RPL9P7"   "RPL17P36" "RPS3AP20" "RPL7P9"   "RPL15"    "RPL7P32" 
## [13] "RPS3AP25" "RPL4P4"   "RPS3AP47" "RPL7"     "RPL17P34" "RPL6"    
## [19] "RPL5"     "RPL39"    "RPL5P4"   "RPL7P26"  "RPL4"     "SNHG5"   
## [25] "SF3B6"    "RPL7P1"   "RPL7P15"  "RPL5P34"  "RPL7P6"   "RPL7P24" 
## [31] "RPL39P3"  "RPS20P14" "RPL6P27"  "RPS23"    "RPS25"    "RPL17P43"
## [37] "RPS7P11"  "RPS3AP49" "RPL24P7"  "RPS4X"   
## 
## $Module_7
##  [1] "ALYREF"     "SMPD4"      "TK1"        "MAD2L2"     "CHTF18"    
##  [6] "SAE1"       "TOMM40"     "DTYMK"      "PPM1G"      "KHSRP"     
## [11] "TRAF4"      "SLC25A29"   "DCAF15"     "HNRNPUL1"   "UBN1"      
## [16] "FBL"        "HNRNPA1P63" "TCF3"       "TOMM40P4"   "SAFB"      
## [21] "LRRC20"     "LMNB2"      "CPSF4"      "TOMM40P1"   "RNPS1P1"   
## [26] "POP7"       "SGTA"       "RBM19"      "SAFB2"      "EZR"       
## [31] "TOMM34"     "H1-10"      "SNHG7"      "ITPKA"      "CNIH2"     
## [36] "CDKN2D"     "LAS1L"      "RANP1"      "ARRB2"      "TRAF2"     
## 
## $Module_8
##  [1] "USP12"    "PAK2"     "RPRD1A"   "ATP11C"   "UBA6"     "PANK3"   
##  [7] "ARPP19"   "SMARCAD1" "TNPO1"    "MAPK6"    "USP15"    "PAPOLA"  
## [13] "MYSM1"    "MARCHF7"  "PPP4R2"   "TAB2"     "ZNF518B"  "RP2"     
## [19] "DNM1L"    "STAG2"    "KIF5B"    "MOB1A"    "QKI"      "ITGA4"   
## [25] "LYPLA1"   "PGAP1"    "PPP6R3"   "PTPN4"    "MORC3"    "UBQLN1"  
## [31] "FAM91A1"  "AHR"      "SPRED1"   "PCNX4"    "AIDAP1"   "TTLL7"   
## [37] "SOAT1"    "BCLAF1P2" "CGGBP1"   "SSX2IP"  
## 
## $Module_9
##  [1] "FANCA"   "HAS3"    "POLE"    "SMC1A"   "SLC1A4"  "SLC29A1" "NUP188" 
##  [8] "LZTS1"   "RNF227"  "PTBP1"   "TCF19"   "TFDP1"   "MYO19"   "TMEM201"
## [15] "MICAL3"  "AKAP1"   "SCAF4"   "CDCA4"   "ICMT"    "CDT1"    "NAA40"  
## [22] "NPHP4"   "SAP130"  "GEMIN4"  "TMEM109" "TONSL"   "PASK"    "TLN2"   
## [29] "FHOD3"   "CAD"     "GPRIN1"  "NUP214"  "MCM2"    "UTP4"    "POFUT1" 
## [36] "MANEAL"  "DPF1"    "MCM5"    "CABLES2" "ACACB"  
## 
## $Module_10
##  [1] "DSP"        "CSGALNACT1" "CDH6"       "TRPC6"      "ANKLE2"    
##  [6] "CHST11"     "CCN2"       "CREB3L2"    "SIRPA"      "SMOC1"     
## [11] "B4GALT1"    "FLRT2"      "BHLHE40"    "AKAP12"     "NOX4"      
## [16] "ADAM19"     "F3"         "KCNH1"      "TFPI2"      "MYOCD"     
## [21] "EDN1"       "IL11"       "PCNX1"      "TCF7"       "DCBLD1"    
## [26] "CRISPLD2"   "S1PR1"      "TIMP3"      "COL4A1"     "ATP2A2"    
## [31] "SLC22A23"   "ABHD2"      "SIRPB1"     "HIVEP2"     "PTGS2"     
## [36] "TEX2"       "CSTF2T"     "UCK2"       "PRSS23"     "KLF7"      
## 
## $Module_11
##  [1] "VARS1"    "PPP1R12C" "GNA11"    "SRM"      "ARF1"     "BCAR1"   
##  [7] "AP1M1"    "MRPL4"    "PGP"      "PLCB3"    "IMPDH1"   "AACS"    
## [13] "CNN2"     "FARSA"    "PSMG4"    "BOP1"     "DHX30"    "TRIM11"  
## [19] "ALDH16A1" "MFN2"     "ATAD3C"   "SIVA1"    "GTPBP1"   "CCDC9"   
## [25] "CNP"      "ATAD3B"   "KLF16"    "SMTN"     "SMARCA4"  "CNOT3"   
## [31] "MBD3"     "THOP1"    "KIAA2013" "ALKBH5"   "PREB"     "PRADC1"  
## [37] "TTLL12"   "MTA1"     "C19orf25" "APBA3"   
## 
## $Module_12
##  [1] "CBARP"    "CHMP1B"   "DACT1"    "TNFRSF1B" "INHBA"    "STC1"    
##  [7] "IL1R1"    "PMEPA1"   "PRPSAP1"  "SOX4"     "CBLB"     "ETS2"    
## [13] "RASD2"    "TMEM120B" "NR4A3"    "SLC16A6"  "CREM"     "NFATC2"  
## [19] "PITX2"    "MSC"      "MAFK"     "NEDD9"    "LIF"      "PDGFA"   
## [25] "NGF"      "PPP1R3C"  "IL4R"     "SPHK1"    "KLHL26"   "MAP3K4"  
## [31] "NKX3-1"   "SSBP3"    "SHROOM2"  "NHS"      "NFATC1"   "F2RL1"   
## [37] "TENT5A"   "MECOM"    "PITPNC1"  "NR4A2"   
## 
## $Module_13
##  [1] "CHRM2"      "CPED1"      "DSEL"       "SNX18"      "PRRX1"     
##  [6] "PRKCA"      "CPEB2"      "IGF2BP3"    "NRP1"       "CCDC50"    
## [11] "CAMK2D"     "CDC42EP3"   "PCDH18"     "FGD4"       "ELK3"      
## [16] "DENND10P1"  "CDC42EP3P1" "LIX1L"      "PRKAR1AP1"  "TBCK"      
## [21] "VCPIP1"     "NBEA"       "LINC00674"  "WASF3"      "WDR19"     
## [26] "SNAI2"      "PLAG1"      "SEC22B3P"   "SEC22B2P"   "FMN2"      
## [31] "PPP1R21"    "SATB1"      "RUNX2"      "FNIP1"      "NPEPPS"    
## [36] "PDPK1"      "MTCO1P2"    "ARL2BP"     "YWHAG"      "NOP56"     
## 
## $Module_14
##  [1] "KRTAP2-4" "KRTAP2-2" "KRTAP2-1" "KRTAP2-3" "FOSL1P1"  "FOSL1"   
##  [7] "PRAG1"    "FZD2"     "BCAR3"    "SEC14L1"  "NAMPT"    "CTH"     
## [13] "NAMPTP1"  "OSR1"     "PRDM1"    "PTPRE"    "AGFG2"    "DVL2"    
## [19] "MVK"      "AREG"     "FZD7"     "RYBP"     "FOSB"     "ATP8B1"  
## [25] "TMEM87A"  "TRAK1"    "FRY"      "NR3C2"    "JCAD"     "GRB10"   
## [31] "RBMS2P1"  "AMMECR1L" "CBX7"     "ZFAND5"   "ACVR1B"   "MAMSTR"  
## [37] "ELMO2"    "RIPK2"    "PIK3R1"   "SASH1"
# 2. Generating adjacency matrix for hub genes from all modules
hub_genes<-unname(unlist(network_hub_list))
hubs_m<-TOM[hub_genes, hub_genes]
edges<-data.frame(from=rownames(hubs_m)[row(hubs_m)[upper.tri(hubs_m)]], 
           to=colnames(hubs_m)[col(hubs_m)[upper.tri(hubs_m)]], 
           value=hubs_m[upper.tri(hubs_m)])

# 3. Setting up thredshold for selecting leading edges in the network
leading_edges_ratio = 0.20 #top 1/5 edges based on edge weight.
threshold = quantile(edges$value, probs=1-leading_edges_ratio , na.rm = FALSE)
edges<-edges[edges$value >= threshold,]

# 4. Organizing vertices data frame and edges data frame for the network
vertices<-data.frame()
for(i in 1:length(network_hub_list)){
  module_name<-names(network_hub_list)[i]
  v<-data.frame(name = network_hub_list[[i]], module = rep(module_name, nTOM), size = rep(8, nTOM))
  vertices<-rbind(vertices, v)
}

vertices[,"group"]<-unlist(lapply(vertices$module, function(x) strsplit(x, "_")[[1]][2]))
vertices[,"id"]<-rownames(vertices)

# 5. Centrality analysis of hub genes co-expression network
g<-graph_from_data_frame(edges, directed = F, vertices)
eigenvector<-eigen_centrality(g)$vector
betweenness<-betweenness(g)
degree<-degree(g)
closeness<-closeness(g)
centralities <- cbind(degree, eigenvector, betweenness, closeness)
centrality_df<-cbind(centralities, vertices$module)

DT::datatable(centrality_df) #show centrality data of nodes for sorting
# 6. Visualize the hub genes co-expression network
edges[,"source"]<-vertices[match(edges$from, vertices$name), "id"]
edges[,"target"]<-vertices[match(edges$to, vertices$name), "id"]
edges$source<-as.numeric(edges$source)
edges$target<-as.numeric(edges$target)

edges<-data.frame(source = edges$source-1, target = edges$target-1, value = edges$value)
vertices<-cbind(vertices, centralities)
vertices[,"betweenness_sqrt"]<-sqrt(unlist(vertices$betweenness))

color_df<-data.frame(color = color_pal, module = c(14,1,11,13,5,10,12,4,3,8,6,9,7,2))
color_df<-color_df[order(as.numeric(sub("\\D+", "", color_df$module))),] #match the module color with previous figures.

ColourScale <- 'd3.scaleOrdinal()
            .domain(["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"])
           .range(["#9ECAE1", "#6A51A3", "#FDAE6B", "#FEE6CE", "#BAE4B3", "#F2F0F7", "#9E9AC8", "#E6550D", "#CBC9E2", "#74C476", "#3182BD", "#238B45", "#EDF8E9", "#DEEBF7"]);'

fn<-forceNetwork(Links = edges, Nodes = vertices,
                 Source = "source", Target = "target",
                 Nodesize = "betweenness_sqrt", fontSize = 12,
                 Value = "value", NodeID = "name",
                 Group = "group", opacity = 1,
                 legend = T, charge = -20, zoom = F,
                 opacityNoHover = 1,
                 colourScale= JS(ColourScale),
                 linkWidth = JS("function(d) { return Math.cbrt(d.value)*0.4; }")
                 )
fn

IMR90 GRN analysis

Creating GRN for IMR90 ER (29 samples) and GFP (27 samples) (PANDA + LIONESS)

Generating input data for creating PANDA network

IMR90_normalized_exp<-readRDS("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_processed/IMR90_realigned_normalized_data.RDS")
IMR90_ppi<-read.table("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/ppi_imr90.txt", sep = "\t", quote = "")
IMR90_motif<-read.table("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/motif_imr90.txt", sep = "\t", quote = "")

IMR90_ALL<-as.data.frame(IMR90_normalized_exp$E)
IMR90_gene<-IMR90_normalized_exp$genes
IMR90_gene<-IMR90_gene[,c("ensembl_gene_id", "hgnc_symbol")]

IMR90_ALL[,"ensembl_gene_id"]<-rownames(IMR90_ALL)
IMR90_ALL<-left_join(IMR90_ALL, IMR90_gene, by = "ensembl_gene_id")
IMR90_ALL<-IMR90_ALL[!IMR90_ALL$hgnc_symbol == "",]
IMR90_ALL<-IMR90_ALL[!duplicated(IMR90_ALL$hgnc_symbol),]
IMR90_ALL<-IMR90_ALL[!is.na(IMR90_ALL$hgnc_symbol),]
rownames(IMR90_ALL)<-IMR90_ALL$hgnc_symbol
IMR90_ALL<-IMR90_ALL[IMR90_ALL$hgnc_symbol %in% unique(c(IMR90_motif$V1, IMR90_motif$V2)), ]
IMR90_ALL<-IMR90_ALL[, 1:(ncol(IMR90_ALL)-2)]

IMR90_ER<-IMR90_ALL[, grep("ER", colnames(IMR90_ALL))]
#write.table(IMR90_ER, file = "/xdisk/mpadi/jiawenyang/data/merkel_cell/IMR90_analysis/IMR90_ER.txt", sep = "\t", quote = F, row.names = F, col.names = F)

IMR90_GFP<-IMR90_ALL[, grep("GFP", colnames(IMR90_ALL))]
#write.table(IMR90_GFP, file = "/xdisk/mpadi/jiawenyang/data/merkel_cell/IMR90_analysis/IMR90_GFP.txt", sep = "\t", quote = F, row.names = F, col.names = F)

Running PANDA and LIONESS in matlab to create single sample netowrk for samples in ER and GFP group

addpath(genpath('/home/u16/jiawenyang/Matlab/netZooM'))

%set program parameters
exp_file  = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/imr90_er.txt';
motif_file = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/motif_imr90.txt';
ppi_file = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/ppi_imr90.txt';
panda_out = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/panda/imr90_er/IMR90_ER_panda.mat';%optional
save_temp = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/panda/imr90_er/';
TF_out = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/panda/imr90_er/TF_out.txt';
GENE_out = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/panda/imr90_er/GENE_out.txt';
alpha      = 0.1;

%%
fid = fopen(exp_file, 'r');
headings = fgetl(fid);
n = length(regexp(headings, '\t'));
frewind(fid);
Exp = textscan(fid, ['%s', repmat('%f', 1, n)], 'delimiter', '\t');
fclose(fid);
GeneNames = Exp{1};
Exp = cat(2, Exp{2:end});
[NumGenes, NumConditions] = size(Exp);
fprintf('%d genes and %d conditions!\n', NumGenes, NumConditions);
Exp = Exp'; %transpose expression matrix from gene-by-sample to sample-by-gene

%%
disp('Reading in motif data!');
[TF, gene, weight] = textread(motif_file, '%s%s%f');
TFNames = unique(TF);
NumTFs = length(TFNames);
[~,i] = ismember(TF, TFNames);
[~,j] = ismember(gene, GeneNames);
RegNet = zeros(NumTFs, NumGenes);
RegNet(sub2ind([NumTFs, NumGenes], i, j)) = weight;
fprintf('%d TFs and %d edges!\n', NumTFs, length(weight));

%%
disp('Reading in ppi data!');
TFCoop = eye(NumTFs);
if(~isempty(ppi_file))
    [TF1, TF2, weight] = textread(ppi_file, '%s%s%f');
    [~,i] = ismember(TF1, TFNames);
    [~,j] = ismember(TF2, TFNames);
    TFCoop(sub2ind([NumTFs, NumTFs], i, j)) = weight;
    TFCoop(sub2ind([NumTFs, NumTFs], j, i)) = weight;
    fprintf('%d PPIs!\n', length(weight));
end

%%
%% ============================================================================
%% Run PANDA
%% ============================================================================
disp('Computing coexpresison network:');
tic; GeneCoReg = Coexpression(Exp); toc;

disp('Normalizing Networks:');
tic
    RegNet = NormalizeNetwork(RegNet);
    GeneCoReg = NormalizeNetwork(GeneCoReg);
    TFCoop = NormalizeNetwork(TFCoop);
toc
disp('Running Panda algorithm:');
tic; AgNet = PANDA(RegNet, GeneCoReg, TFCoop, alpha); toc;

%%
%save all panda out put

if ~isempty(panda_out)
    disp('Saving PANDA network!');
    tic
        [pathstr, name, ext] = fileparts(panda_out);
        switch ext
            case '.txt'
                save(panda_out, 'AgNet', '-ascii');
            case '.tsv'
                save(panda_out, 'AgNet', '-ascii', '-tabs');
            otherwise
                save(panda_out, 'AgNet', '-v6');

        end
    toc
end

%%
%save all temp files
if ~isempty(save_temp)
    disp('Saving the transposed expression matrix and normalized networks:');
    if ~exist(save_temp, 'dir')
        mkdir(save_temp);
    end
    tic
        save(fullfile(save_temp, 'expression.transposed.mat'), 'Exp', '-v7.3');  % 2G+
        save(fullfile(save_temp, 'motif.normalized.mat'), 'RegNet', '-v6');  % fast
        save(fullfile(save_temp, 'ppi.normalized.mat'), 'TFCoop', '-v6');  % fast
    toc
end

fout=fopen(TF_out,'w');
fprintf(fout,'%s\n',string(TFNames));

fout=fopen(GENE_out,'w');
fprintf(fout,'%s\n',string(GeneNames));
%%
disp('All done!');
disp(datestr(now));


%% ============================================================================
%% Load the PANDA outputs for LIONESS
%% ============================================================================

%set program parameters
exp_file  = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/panda/imr90_er/expression.transposed.mat';
motif_file = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/panda/imr90_er/motif.normalized.mat';
ppi_file = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/panda/imr90_er/ppi.normalized.mat';
panda_file = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/panda/imr90_er/IMR90_ER_panda.mat';
save_dir   = '/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/lioness/imr90_er';  % output dir for LIONESS networks
lib_path   = '/home/u16/jiawenyang/Matlab/netZooM';
alpha      = 0.1;
START      = 1;  % sample-of-interest starting from this index
END        = -1;  % sample-of-interest ending to this index; use -1 to end at the last sample
ascii_out  = 1;  % set to 1 if you prefer text output file instead of MAT-file
disp(datestr(now));

%%
%AgNet = AgNet';

%%
disp('Reading in expression data!');
tic
    X = load(exp_file);
    Exp = X.Exp;
    [NumConditions, NumGenes] = size(Exp);  % transposed expression
    fprintf('%d genes and %d conditions!\n', NumGenes, NumConditions);
toc
%%
disp('Reading in motif data!');
tic
    X = load(motif_file);
    RegNet = X.RegNet;
toc
%%
disp('Reading in ppi data!');
tic
    X = load(ppi_file);
    TFCoop = X.TFCoop;
toc
%%
disp('Reading in PANDA network!');
tic
    X = load(panda_file);
    AgNet = X.AgNet;
toc
    
%% ============================================================================
%% Run LIONESS
%% ============================================================================
addpath(genpath('/home/u16/jiawenyang/Matlab/netZooM'))
      
% Create the output folder if not exists
if ~exist(save_dir, 'dir')
    mkdir(save_dir);
end

%% Sample indexes to iterate
if END == -1
    indexes = START:NumConditions;
else
    indexes = START:END;
end

for i = indexes
    fprintf('Running LIONESS for sample %d:\n', i);
    idx = [1:(i-1), (i+1):NumConditions];  % all samples except i

    disp('Computing coexpresison network:');
    tic; GeneCoReg = Coexpression(Exp(idx,:)); toc;

    disp('Normalizing Networks:');
    tic; GeneCoReg = NormalizeNetwork(GeneCoReg); toc;

    disp('Running PANDA algorithm:');
    LocNet = PANDA(RegNet, GeneCoReg, TFCoop, alpha);
    PredNet = NumConditions * (AgNet - LocNet) + LocNet;

    disp('Saving LIONESS network:');
    if ascii_out == 1
        f = fullfile(save_dir, sprintf('lioness_%d.txt', i));
        tic; save(f, 'PredNet', '-ascii'); toc;
    else
        f = fullfile(save_dir, sprintf('lioness_%d.mat', i));
        tic; save(f, 'PredNet', '-v6'); toc;
    end
    fprintf('Network saved to %s\n', f);

    clear idx GeneCoReg LocNet PredNet f; % clean up for next run
end

disp('All done!');
disp(datestr(now));

And then, run the code above for IMR90-GFP samples.

Organizing the LIONESS outputs in R

TF_order<-read.table("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/panda/imr90_er/TF_out.txt")
Gene_order<-read.table("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/panda/imr90_er/GENE_out.txt")

conditions<-c("er", "gfp")

IMR90_lioness<-list()
for (k in 1:length(conditions)){
  condition<-conditions[k]
  print(condition)
  FILES<-list.files(path = paste0("./imr90_", condition, "/"), pattern = "*.txt", full.names = T)
  mat_sample_er<-scan(FILES[1])
  sample_matrix<-matrix(mat_sample_er, nrow = nrow(TF_order), ncol = nrow(Gene_order))
  rownames(sample_matrix)<-TF_order$V1
  colnames(sample_matrix)<-Gene_order$V1
  tf_gene_lioness_weight<-reshape2::melt(sample_matrix)
  colnames(tf_gene_lioness_weight)<-c("TF", "Gene", paste0(condition,"1"))
  for (i in 2:length(FILES)){
    id<-FILES[i]
    id<-strsplit(id, "_")[[1]][3]
    id<-strsplit(id, "[.]")[[1]][1]
    mat<-scan(FILES[i])
    matrix_i<-matrix(mat, nrow = nrow(TF_order), ncol = nrow(Gene_order))
    rownames(matrix_i)<-TF_order$V1
    colnames(matrix_i)<-Gene_order$V1
    tf_gene_edges<-reshape2::melt(matrix_i)
    df<-data.frame(v1 = tf_gene_edges[,3])
    colnames(df)<-paste0(condition,id)
    tf_gene_lioness_weight<-cbind(tf_gene_lioness_weight, df)
  }
  IMR90_lioness[[condition]]<-tf_gene_lioness_weight
}
saveRDS(IMR90_lioness, file = "/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn/IMR90_lioness_R_output_organized.RDS")

Organizing the lioness by time period

DATA_DIR<-c("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn")
coldata<-data.frame(samples = colnames(IMR90_normalized_exp), order = as.character(c(1:27, 1:30, 1:24, 1:29)), group = c(rep("GFP", 27), rep("ST", 30), rep("TLT", 24), rep("ER", 29)))

coldata_new<-coldata
coldata_new[,"time"]<-unlist(lapply(coldata_new$samples, function(X) strsplit(X, "_")[[1]][2]))

t1<-c(0,4)
t2<-c(8,12)
t3<-c(16, 20)
t4<-c(24, 32)
t5<-c(40, 48)

time_list_5t_period<-list(t1 = t1, t2 = t2, t3 = t3, t4 = t4, t5 = t5)

IMR90_lioness_t5_mean<-list()
for (i in 1:length(IMR90_lioness)){
  condition<-names(IMR90_lioness)[i]
  df<-IMR90_lioness[[condition]]
  time_period_mean<-IMR90_lioness$IMR90_lioness_gfp[,1:2]
  for (k in 1:length(time_list_5t_period)){
    time_points<-time_list_5t_period[[k]]
    time_points_samples<-coldata_new[coldata_new$time %in% time_points, "samples"]
    subdf<-df[, colnames(df) %in% time_points_samples]
    average_value<-apply(subdf, 1, mean)
    name<-names(time_list_5t_period)[k]
    subdf<-as.data.frame(average_value)
    colnames(subdf)<-name
    time_period_mean<-cbind(time_period_mean, subdf)
  }
  IMR90_lioness_t5_mean[[condition]]<-time_period_mean
}
saveRDS(IMR90_lioness_t5_mean, file = paste0(DATA_DIR, "/IMR90_lioness_t5_period.RDS"))

Detecting differential communities between ER and GFP for each time period (ALPACA)

IMR90_lioness_t5_mean<-readRDS(file = paste0(DATA_DIR, "/IMR90_lioness_t5_period.RDS"))

time_conditions2<-c("t1", "t2", "t3", "t4", "t5")
for (i in 2:length(IMR90_lioness_t5_mean)){
  exp_condition<-names(IMR90_lioness_t5_mean)[i]
  exp_condition<-gsub("IMR90_lioness_", "", exp_condition)
  conditioned_net<-IMR90_lioness_t5_mean[[i]]
  for (k in 1:length(time_conditions2)){
    time_condition<-time_conditions2[k]
    conditioned_net_one_time<-conditioned_net[, c("TF", "Gene", time_condition)]
    colnames(conditioned_net_one_time)<-c("TF", "Gene", "Score")
    conditioned_net_one_time[,"V1"]<-IMR90_lioness_t5_mean$IMR90_lioness_gfp[,time_condition]
    control_net_one_time<-conditioned_net_one_time[, c("TF", "Gene", "V1", "Score")]
    colnames(control_net_one_time)<-c("TF", "Gene", "V1", "V2")
    control_net_one_time[,c("V1", "V2")]<-log(exp(1)^control_net_one_time[,c("V1", "V2")]+1, base = exp(1)) #transform the edge weights so that avoid negative values
    net.table<-as.data.frame(control_net_one_time)
    netZooR::alpaca(net.table, file.stem = paste0(DATA_DIR, "/t5_period_transformed/",exp_condition, "_vs_gfp_",time_condition), verbose = T)
  }
}

Generating sankey plot to visualize the dynamic of gene regulatory network modules

DATA_DIR<-c("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_grn")
# 1.Creating the function that used to create the plots
sanktify <- function(x) {

  # Create nodes DF with the unique sources & targets from input
  nodes <- data.frame(unique(c(x$source,x$target)),stringsAsFactors=FALSE)
  nodes$ID <- as.numeric(rownames(nodes)) - 1 # sankeyNetwork requires IDs to be zero-indexed
  names(nodes) <- c("name", "ID")

  # use dplyr join over merge since much better; in this case not big enough to matter
  # Replace source & target in links DF with IDs
  links <- inner_join(x, nodes, by = c("source"="name"))
  links<- inner_join(links, nodes, by = c("target"="name"))
  colnames(links)<-c("source", "target", "value", "source_ID", "target_ID")

  # Create Sankey Plot
  sank <- sankeyNetwork(
    Links = links,
    Nodes = nodes,
    Source = "source_ID",
    Target = "target_ID",
    Value = "value",
    NodeID = "name",
    fontSize = 10,
    nodeWidth = 40
  )
  return(sank)
}

# 2.Organizing the ALPACA outputs from different time periods for Sankey plot
FILES<-list.files(path = paste0(DATA_DIR, "/t5_period_transformed"), 
                  pattern = "*_ALPACA_final_memb.txt",
                  full.names = T,
                  recursive = T)

alpaca_t5_period<-lapply(FILES, function(x) read.table(x, sep = "", quote = ""))
names(alpaca_t5_period)<-c("t1", "t2", "t3","t4", "t5")

alpaca_t5_period_df<-left_join(alpaca_t5_period$t1, alpaca_t5_period$t2, by = "V1")
alpaca_t5_period_df<-left_join(alpaca_t5_period_df, alpaca_t5_period$t3, by = "V1")
alpaca_t5_period_df<-left_join(alpaca_t5_period_df, alpaca_t5_period$t4, by = "V1")
alpaca_t5_period_df<-left_join(alpaca_t5_period_df, alpaca_t5_period$t5, by = "V1")
colnames(alpaca_t5_period_df)<-c("Gene", "t1", "t2", "t3", "t4", "t5")

n = 30 #remove the modules whose size are less than 30
remove_genes_list<-c()
for (i in 1:5){
  time_period<-paste0("t", i)
  df<- as.data.frame(table(alpaca_t5_period_df[,time_period]))
  module_number<-df[,1][df[,2] <= n]
  remove_genes<-alpaca_t5_period_df[alpaca_t5_period_df[,time_period] %in% module_number, "Gene"]
  remove_genes_list<-c(remove_genes_list, remove_genes)
}

alpaca_t5_period_df_new<-alpaca_t5_period_df[!alpaca_t5_period_df[, "Gene"] %in% remove_genes_list,]


alpaca_t5_period_t1<-as.data.frame(table(alpaca_t5_period_df_new$t1))
alpaca_t5_period_t2<-as.data.frame(table(alpaca_t5_period_df_new$t2))
alpaca_t5_period_t3<-as.data.frame(table(alpaca_t5_period_df_new$t3))
alpaca_t5_period_t4<-as.data.frame(table(alpaca_t5_period_df_new$t4))
alpaca_t5_period_t5<-as.data.frame(table(alpaca_t5_period_df_new$t5))

t1_to_t2<-data.frame()
for (i in 1:length(unique(alpaca_t5_period_t1$Var1))){
   from_node<-unique(alpaca_t5_period_t1$Var1)[i]
   z<-data.frame()
   for(k in 1:length(unique(alpaca_t5_period_t2$Var1))){
     to_node<-unique(alpaca_t5_period_t2$Var1)[k]
     value = nrow(alpaca_t5_period_df_new[alpaca_t5_period_df_new$t1 == from_node & alpaca_t5_period_df_new$t2 == to_node,])
     df<-data.frame(source = paste0("t1_module_", from_node), target = paste0("t2_module_",to_node), value = value)
     z = rbind(z, df)
   }
  t1_to_t2<-rbind(t1_to_t2, z)
}

t2_to_t3<-data.frame()
for (i in 1:length(unique(alpaca_t5_period_t2$Var1))){
   from_node<-unique(alpaca_t5_period_t2$Var1)[i]
   z<-data.frame()
   for(k in 1:length(unique(alpaca_t5_period_t3$Var1))){
     to_node<-unique(alpaca_t5_period_t3$Var1)[k]
     value = nrow(alpaca_t5_period_df_new[alpaca_t5_period_df_new$t2 == from_node & alpaca_t5_period_df_new$t3 == to_node,])
     df<-data.frame(source = paste0("t2_module_", from_node), target = paste0("t3_module_",to_node), value = value)
     z = rbind(z, df)
   }
  t2_to_t3<-rbind(t2_to_t3, z)
}

t3_to_t4<-data.frame()
for (i in 1:length(unique(alpaca_t5_period_t3$Var1))){
   from_node<-unique(alpaca_t5_period_t3$Var1)[i]
   z<-data.frame()
   for(k in 1:length(unique(alpaca_t5_period_t4$Var1))){
     to_node<-unique(alpaca_t5_period_t4$Var1)[k]
     value = nrow(alpaca_t5_period_df_new[alpaca_t5_period_df_new$t3 == from_node & alpaca_t5_period_df_new$t4 == to_node,])
     df<-data.frame(source = paste0("t3_module_", from_node), target = paste0("t4_module_",to_node), value = value)
     z = rbind(z, df)
   }
  t3_to_t4<-rbind(t3_to_t4, z)
}

t4_to_t5<-data.frame()
for (i in 1:length(unique(alpaca_t5_period_t4$Var1))){
   from_node<-unique(alpaca_t5_period_t4$Var1)[i]
   z<-data.frame()
   for(k in 1:length(unique(alpaca_t5_period_t5$Var1))){
     to_node<-unique(alpaca_t5_period_t5$Var1)[k]
     value = nrow(alpaca_t5_period_df_new[alpaca_t5_period_df_new$t4 == from_node & alpaca_t5_period_df_new$t5 == to_node,])
     df<-data.frame(source = paste0("t4_module_", from_node), target = paste0("t5_module_",to_node), value = value)
     z = rbind(z, df)
   }
  t4_to_t5<-rbind(t4_to_t5, z)
}

t_all<-Reduce(bind_rows, list(t1_to_t2, t2_to_t3, t3_to_t4, t4_to_t5))
sanktify(t_all)

Applying threshold to gene modules and performing go enrichment analysis on gene modules from different time periods

time_points<-c("t1", "t2", "t3", "t4", "t5")
ER_modules_enrichment_result_t5<-list()
for (i in 1:5){
  time_point<-time_points[i]
  alpaca_community<-read.table(FILES[i], sep = "", quote = "")
  alpaca_community<-alpaca_t5_period_df_new[,c("Gene", time_point)]
  alpaca_community[,"gene"]<-unlist(lapply(alpaca_community[,"Gene"], function(x) strsplit(x, "_")[[1]][1]))
  alpaca_community[,"type"]<-unlist(lapply(alpaca_community[,"Gene"], function(x) strsplit(x, "_")[[1]][2]))
  alpaca_community<-alpaca_community[,c("gene", "type", time_point)]
  colnames(alpaca_community)<-c("gene", "type", "community")
  ER_modules_enrichment_result<-list()
    for (k in 1:length(unique(alpaca_community$community))){
      module<-unique(alpaca_community$community)[k]
      gene_list<-alpaca_community[alpaca_community$community == module, "gene"]
      label<-paste0("er_vs_gfp_",time_point, "_module_", module)
      print(label)
      if (length(gene_list) < 10){
        print(gene_list) 
        print("There are too few genes in the list to perform enrichment analysis")
      }
      else {
        ER_BP<-enrichGO(gene_list, 
                         OrgDb = org.Hs.eg.db, 
                         ont = "BP",
                         pAdjustMethod = "BH", 
                         pvalueCutoff = 0.05,
                         qvalueCutoff = 0.25, 
                         keyType = 'SYMBOL')
        #datatable(ER_BP@result)
        ER_modules_enrichment_result[[label]]<-ER_BP@result
      }
     }
 ER_modules_enrichment_result_t5<-c(ER_modules_enrichment_result_t5, ER_modules_enrichment_result)
}
time_points<-c("t1", "t2", "t3", "t4", "t5")
ER_modules_enrichment_result_t5_simplified<-list()
for (i in 1:5){
  time_point<-time_points[i]
  alpaca_community<-read.table(FILES[i], sep = "", quote = "")
  alpaca_community<-alpaca_t5_period_df_new[,c("Gene", time_point)]
  alpaca_community[,"gene"]<-unlist(lapply(alpaca_community[,"Gene"], function(x) strsplit(x, "_")[[1]][1]))
  alpaca_community[,"type"]<-unlist(lapply(alpaca_community[,"Gene"], function(x) strsplit(x, "_")[[1]][2]))
  alpaca_community<-alpaca_community[,c("gene", "type", time_point)]
  colnames(alpaca_community)<-c("gene", "type", "community")
  ER_modules_enrichment_result<-list()
    for (k in 1:length(unique(alpaca_community$community))){
      module<-unique(alpaca_community$community)[k]
      gene_list<-alpaca_community[alpaca_community$community == module, "gene"]
      label<-paste0("er_vs_gfp_",time_point, "_module_", module)
      print(label)
      if (length(gene_list) < 10){
        print(gene_list) 
        print("There are too few genes in the list to perform enrichment analysis")
      }
      else {
        ER_BP<-enrichGO(gene_list, 
                         OrgDb = org.Hs.eg.db, 
                         ont = "BP",
                         pAdjustMethod = "BH", 
                         pvalueCutoff = 0.05,
                         qvalueCutoff = 0.25, 
                         keyType = 'SYMBOL')
        #datatable(ER_BP@result)
        ER_BP_simplified<-clusterProfiler::simplify(ER_BP, cutoff = 0.5, by = "p.adjust", select_fun = min)
        ER_modules_enrichment_result[[label]]<-ER_BP_simplified@result
      }
     }
 ER_modules_enrichment_result_t5_simplified<-c(ER_modules_enrichment_result_t5_simplified, ER_modules_enrichment_result)
}

Selecting top terms from the GO TERM analysis result in each gene module

ER_modules_enrichment_result_t5_sig<-list()
for (i in 1:length(ER_modules_enrichment_result_t5)){
  condition<-names(ER_modules_enrichment_result_t5)[i]
  go_result<-ER_modules_enrichment_result_t5[[condition]]
  go_result<-go_result[go_result$p.adjust <= 0.05, ]
  go_result<-go_result[order(go_result$p.adjust, decreasing = F), ][1:100,]
  go_result<-go_result[order(go_result$Count, decreasing = T), ][1:15,]
  go_result<-go_result[order(go_result$p.adjust, decreasing = T), ][1:10,]
  go_result<-go_result[go_result$Description != "<NA>",]
  go_result<-na.omit(go_result)
  if (is.na(go_result[1,"ID"]) == "FALSE") {
  ER_modules_enrichment_result_t5_sig[[condition]]<-go_result}
}


#t5 module 1
DT::datatable(ER_modules_enrichment_result_t5_sig$er_vs_gfp_t5_module_1)
#t4 module 1
DT::datatable(ER_modules_enrichment_result_t5_sig$er_vs_gfp_t4_module_1)
#t3 module 1
DT::datatable(ER_modules_enrichment_result_t5_sig$er_vs_gfp_t3_module_1)
#t2 module 1
DT::datatable(ER_modules_enrichment_result_t5_sig$er_vs_gfp_t2_module_1)
#t1 module 1
DT::datatable(ER_modules_enrichment_result_t5_sig$er_vs_gfp_t1_module_1)

See attached supplemental table for enriched go terms in all modules from different time periods : ER_modules_enrichment_result_t5_sig.xlsx

Network analyses in GSE39612 (MCC tumor dataset)

Importing data

#Read GSE39612 data ============================================
edat<-readRDS("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/gse39612/renormalized_samples_edat_no_celllines.RDS")
sample_list_renormalized<-readRDS("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/gse39612/renormalized_samples_list_no_celllines.RDS")
GSE39612_annotation<-readRDS(file = "/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/gse39612/GSE39612_annotation.RDS")
MCC_tumor_matrix<-edat[ ,colnames(edat) %in% c(rownames(GSE39612_annotation[GSE39612_annotation$`tissue:ch1` == "Merkel cell carcinoma",]))]

Performing DEG analysis on normal skin samples and MCC tumor samples

#DEGs for normal vs. MCC tumor samples ============================================
group<-factor(c(rep("normal", 64),  rep("tumor", 30)))
names(group)<-sample_list_renormalized$ID
design = model.matrix(~0 + group)
comparison = "grouptumor-groupnormal"
cont=makeContrasts(comparison,levels=design)
fit=lmFit(edat,design)
fit=contrasts.fit(fit,contrasts=cont)
tfit=treat(fit)
efit=eBayes(tfit)
tdf=topTable(efit,coef=1, n = Inf, adjust = "BH")

Performing GO term over-representation analysis on DEGs from normal skin vs. MCC tumors comparison

gse39612_mcc.vs.normal_sig<-tdf[abs(tdf$logFC) >=1 & abs(tdf$adj.P.Va) <= 0.05,]
genes<-rownames(gse39612_mcc.vs.normal_sig)
go_term<-enrichGO(genes,
                  OrgDb = org.Hs.eg.db, 
                  ont='BP',
                  pAdjustMethod = 'BH',
                  pvalueCutoff = 0.05, 
                  qvalueCutoff = 0.25,
                  keyType = 'SYMBOL')
df<-go_term@result

Plotting enriched go terms

go_term<-readRDS("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/gse39612/GSE39612_tumor_vs_normal_GO_enrichment_result.rds")
p<-dotplot(go_term, showCategory = 20, font.size = 20, x = "Count")
print(p)

Plotting heatmap for Wnt signaling pathway genes from DEGs

wnt_signaling_genes<-strsplit(go_term@result[go_term@result$Description == "Wnt signaling pathway","geneID"][1], "/")[[1]]
wnt_signaling_genes_scaled<-t(scale(t(edat[wnt_signaling_genes, sample_list_renormalized$ID])))
wnt_signaling_genes_scaled<-na.omit(wnt_signaling_genes_scaled)

GSE39612_annotation<-GSE39612_annotation[sample_list_renormalized$ID,]
GSE39612_annotation<-GSE39612_annotation[,c("tissue:ch1", "merkel cell polyomavirus status (dna and rna):ch1", "metastasis, primary, or cell line:ch1")]
colnames(GSE39612_annotation)<-c("group", "MCPyV_status", "MCC_status")
GSE39612_annotation$MCPyV_status[!GSE39612_annotation$MCPyV_status %in% c("negative", "positive")]<-"not_available"
GSE39612_annotation$MCC_status[!GSE39612_annotation$MCC_status %in% c("primary tumor", "metastatic tumor")]<-"not_available"

annotation_GSE39612_WNT = HeatmapAnnotation(
  group = GSE39612_annotation$group,
  MCC_status = GSE39612_annotation$MCC_status,
  MCPyV_status = GSE39612_annotation$MCPyV_status,
  col = list(group= c(`normal skin` = "#B7C9F3", `Merkel cell carcinoma` = "#A4312A"),
             MCC_status = c(`not_available` = "#C9CFD0", `metastatic tumor` = "#C47BFC", `primary tumor` = "#01BDC2"),
             MCPyV_status = c(`not_available` = "#C9CFD0", `negative` = "#CCFF99", `positive` = "#FF8000")
  ),
  simple_anno_size = unit(4, "mm"),
  annotation_name_gp= gpar(fontsize = 10, fontface = "bold", col = "black"))

group_column_GSE39612 = factor(GSE39612_annotation$group, levels = c("normal skin", "Merkel cell carcinoma"))

Heatmap(wnt_signaling_genes_scaled,
        heatmap_legend_param = list(title = "Z-scored", title_position = "topleft", legend_height = unit(8, "cm"), legend_direction = "vertical"),
        row_title = "genes",
        cluster_columns = cluster_within_group(wnt_signaling_genes_scaled, group_column_GSE39612),
        column_dend_side = "top",
        show_column_dend = T,
        column_title = "GSE39612 samples",
        show_column_names = F,
        column_title_side = "top",
        column_title_gp = gpar(fontsize = 18, fontface = "bold", col="black"),
        row_names_gp = gpar(fontsize = 10, fontface = "bold"),
        top_annotation = annotation_GSE39612_WNT,
        width = ncol(wnt_signaling_genes_scaled)*unit(3, "mm"), 
        height = nrow(wnt_signaling_genes_scaled)*unit(3, "mm"),
        border = T
)

Performing WGCNA on MCC tumor samples

nettype = "signed"
powers = c(c(1:10), seq(from = 12, to=20, by=2))

MCC_expression_data<-edat[, rownames(GSE39612_annotation[GSE39612_annotation$group=="Merkel cell carcinoma",])]

sft_MCC=pickSoftThreshold(t(MCC_expression_data), dataIsExpr = TRUE, powerVector = powers, networkType = nettype, verbose = 5)

# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft_MCC$fitIndices[,1], -sign(sft_MCC$fitIndices[,3])*sft_MCC$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab= paste0("Scale Free Topology Model Fit ", nettype, " R^2"), type="n",
     main = paste("Scale independence"))+text(sft_MCC$fitIndices[,1], -sign(sft_MCC$fitIndices[,3])*sft_MCC$fitIndices[,2],
     labels=powers,cex=1,col="red")+abline(h=0.90,col="red")

# Mean connectivity as a function of the soft-thresholding power
plot(sft_MCC$fitIndices[,1], sft_MCC$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))+text(sft_MCC$fitIndices[,1], sft_MCC$fitIndices[,5], labels=powers, cex=1, col="red")

##Create co-expression matrix=========================
cor <- WGCNA::cor
net_MCC = blockwiseModules(
  t(MCC_expression_data),
  power = 16 , #https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/faq.html Q#6 or based on powerEstimate
  TOMType = "signed", 
  minModuleSize = 30,   # We like large modules, so we set the minimum module size relatively high
  reassignThreshold = 0, 
  mergeCutHeight = 0.25,
  numericLabels = TRUE, 
  pamRespectsDendro = FALSE,
  saveTOMs = F,
  verbose = 3
)
table(net_MCC$colors)
cor<-stats::cor

Calculating co-expressed gene modules

MCC_expression_data<-edat[, rownames(GSE39612_annotation[GSE39612_annotation$group=="Merkel cell carcinoma",])]
net_MCC<-readRDS(file = "/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/gse39612/WGCNA_net_mcc.rds")

moduleLabels = net_MCC$colors
MEs = net_MCC$MEs;

# plot Eigengene adjacency heatmap
plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle = 30)

Performing GO-terms analysis on each gene module from MCC tumor samples

#1.Organizing genes in each WGCNA module for GO-term analysis
genemart<-readRDS("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_wgcna/genemart.rds")
probes = colnames(t(MCC_expression_data))
g_list<-getBM(filters = "hgnc_symbol", 
                       attributes= c("ensembl_gene_id","hgnc_symbol", "entrezgene_id"),values=probes,mart= genemart)
gene_list<-list()
for (i in 1:length(unique(moduleLabels))){
  number<-unique(moduleLabels)[i]
  module_name<-paste0("Module_", number)
  modnumber<-probes[moduleLabels == number]
  df_genes<-g_list[g_list$hgnc_symbol %in% modnumber,"ensembl_gene_id"]
  gene_list[[module_name]]<-df_genes
}

gene_df<-list()
module_name_list<-paste0("Module_", c(1:14))
for (i in 1:14){
  module_name<-module_name_list[i] #remove Module_0, since it's the module with genes that do not co-express with others
  genes<-gene_list[[module_name]]
  df_genes<-g_list[g_list$ensembl_gene_id %in% genes,]
  df_genes<-df_genes[!duplicated(df_genes$hgnc_symbol),]
  gene_df[[module_name]]<-df_genes
}
#2. Performing GO-term over-representation analysis on genes in each module
WGCNA_modules_go_result<-list()
for (i in 1:length(gene_df)){
  module_name<-names(gene_df)[i]
  genes<-gene_df[[module_name]]
  genes<-genes$hgnc_symbol
  em_ER_BP<-enrichGO(genes, 
                     OrgDb = org.Hs.eg.db, 
                     ont = "BP",
                   pAdjustMethod = "BH", 
                     pvalueCutoff = 0.05,
                     qvalueCutoff = 0.25, 
                     keyType = 'SYMBOL')
  go_terms<-em_ER_BP@result
  go_terms<-go_terms[go_terms$p.adjust <= 0.05,]
  go_terms<-go_terms[order(go_terms$Count, decreasing = T),]
  print(module_name)
 WGCNA_modules_go_result[[module_name]]<-em_ER_BP@result
}

Performing network centrality analysis and network visualization of gene modules from WGCNA

adjMatrix <- adjacency(t(MCC_expression_data), power = 16, type = "signed")
TOM = TOMsimilarity(adjMatrix)
TOM = readRDS("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/gse39612/GSE39612_WGCNA_TOM.rds")
probes = colnames(t(MCC_expression_data))
dimnames(TOM)<-list(probes, probes)

# 1.Selecting top hub genes from each gene module
network_hub_list<-list()
nTOM = 40 #top nodes from the WGCNA network
for (i in 1:length(gene_df)){
  module<-names(gene_df)[i]
  genes<-gene_df[[module]][,"hgnc_symbol"]
  m<-TOM[genes, genes]
  hubs<-names(sort(rowSums(m), decreasing = T)[1:nTOM])
  hubs<-na.omit(hubs)
  network_hub_list[[module]]<-hubs
}

network_hub_list # The top 40 hubs in each module
## $Module_1
##  [1] "RO60"     "MED17"    "ANAPC7"   "MSH6"     "ZNF627"   "DNAAF2"  
##  [7] "PA2G4"    "NIF3L1"   "TRMT61B"  "TAF11"    "YEATS4"   "ENOPH1"  
## [13] "CCT6A"    "MSH2"     "SMARCA5"  "PRPF4B"   "METAP2"   "NFYB"    
## [19] "MAPKAPK5" "LARP7"    "NARS2"    "IREB2"    "YAE1"     "CDK4"    
## [25] "PSMD12"   "RBM12"    "SF3B6"    "AIMP1"    "TADA1"    "RPRD1A"  
## [31] "NUP42"    "SNW1"     "DDX18"    "PRPF40A"  "FBXW2"    "ASF1A"   
## [37] "PRELID3B" "HAUS1"    "PCMT1"    "SAE1"    
## 
## $Module_2
##  [1] "LINC00508"   "ST3GAL3"     "OR7E24"      "GTF2IP12"    "LINC00919"  
##  [6] "CLMAT3"      "LINC01844"   "ZNF527"      "MEFV"        "TINAGL1"    
## [11] "NECTIN3-AS1" "TRDN-AS1"    "NAA11"       "LINC00408"   "CSMD2-AS1"  
## [16] "ALDH1B1"     "SSX1"        "CTSE"        "ANHX"        "LRRC43"     
## [21] "ANKRD20A5P"  "FAM66D"      "PIGQ"        "DNAH3"       "LINC02053"  
## [26] "TNFRSF13C"   "HCCAT5"      "CBLIF"       "DSCR4"       "SLC18A1"    
## [31] "C22orf42"    "CYP1A2"      "TRPC7"       "IFNA7"       "GPA33"      
## [36] "E2F4"        "ERVH-1"      "LINC01333"   "SLC25A2"     "TTLL2"      
## 
## $Module_3
##  [1] "CD2"      "IKZF1"    "JAK3"     "ARHGAP30" "HCLS1"    "HLA-DMB" 
##  [7] "CD3D"     "IL2RB"    "APBB1IP"  "RAC2"     "CCL5"     "RCSD1"   
## [13] "IL2RG"    "GPR171"   "HCST"     "CCR5"     "ITGAL"    "FYB1"    
## [19] "CD8A"     "CD84"     "HLA-DOA"  "TRAF3IP3" "CXCL13"   "ITK"     
## [25] "ITGB2"    "CYBB"     "CD48"     "CD4"      "GMFG"     "GIMAP1"  
## [31] "GZMA"     "NLRC3"    "NKG7"     "CD86"     "MNDA"     "CD247"   
## [37] "CD53"     "PRF1"     "EVI2A"    "SLC7A7"  
## 
## $Module_4
##  [1] "KIF22"    "MPHOSPH9" "DLGAP5"   "SSRP1"    "CKS2"     "FEN1"    
##  [7] "NCAPG"    "TIMELESS" "SPAG5"    "MELK"     "PRR11"    "VRK1"    
## [13] "CDCA5"    "LIG1"     "TRIM37"   "CHAF1A"   "DTYMK"    "DDX55"   
## [19] "NDC80"    "TOP2A"    "PRPF19"   "CCNB1"    "RACGAP1"  "PAFAH1B3"
## [25] "PIN1"     "ZNF606"   "BIRC5"    "CBX2"     "EXOSC2"   "UBTF"    
## [31] "ZNF507"   "SAC3D1"   "POLR2D"   "TTK"      "PSMC3IP"  "KIF2C"   
## [37] "HJURP"    "CDCA7L"   "POLD1"    "ILKAP"   
## 
## $Module_5
##  [1] "PNLIPRP3"  "LYNX1"     "KLK10"     "ARG1"      "SERPINB13" "RNASE7"   
##  [7] "EPHX3"     "DSC1"      "IL36G"     "SERPINB7"  "LCE3D"     "TPRG1"    
## [13] "MIR205HG"  "WFDC5"     "HAL"       "FAM83C"    "KLK13"     "CYSRT1"   
## [19] "CDSN"      "DSG1"      "CYP4F22"   "DMKN"      "KLK8"      "IL20RB"   
## [25] "KLK7"      "IL36RN"    "PTK6"      "CAPNS2"    "SERPINB3"  "ALOX12B"  
## [31] "CRCT1"     "ASPRV1"    "IVL"       "ABCA12"    "SBSN"      "SERPINB4" 
## [37] "CALML3"    "AADACL2"   "GDPD3"     "BBOX1"    
## 
## $Module_6
##  [1] "LATS2"    "MMP2"     "PRRX1"    "COL6A2"   "PCOLCE"   "IFITM2"  
##  [7] "PMP22"    "FAP"      "DAB2"     "CTSK"     "CCDC80"   "CRISPLD2"
## [13] "MIR100HG" "PDGFRA"   "COPZ2"    "IL1R1"    "TMEM204"  "IGFBP4"  
## [19] "ANXA2"    "IFITM3"   "HEPH"     "FSTL1"    "COL6A3"   "ANGPTL2" 
## [25] "SERPINF1" "MMP19"    "RAB31"    "LRP1"     "GASK1B"   "OLFML3"  
## [31] "S100A10"  "MXRA5"    "PRDM1"    "CYBRD1"   "OLFML1"   "AEBP1"   
## [37] "EFEMP2"   "ANXA2P2"  "TMEM119"  "THBS2"   
## 
## $Module_7
##  [1] "ZBBX"      "OLIG2"     "LINC01561" "SHISA6"    "DRC1"      "GRIA1"    
##  [7] "MAGEA6"    "CRIP3"     "OLIG1"     "MAGEA11"   "C7orf57"   "TTC29"    
## [13] "MAGEA1"    "PALM3"     "MAGEA12"   "ROPN1L"    "MAGEA4"    "CFAP43"   
## [19] "KCNIP1"    "PDE10A"    "ARMC3"     "TBX5-AS1"  "CFAP126"   "EZHIP"    
## [25] "ZSCAN4"    "ZNF204P"   "STRC"      "SST"       "ZPBP2"     "RSPH1"    
## [31] "SPATA17"   "PACRG"     "MKX"       "C20orf85"  "PDYN"      "TTYH1"    
## [37] "CFAP46"    "EFCAB12"   "KBTBD11"   "DMRT1"    
## 
## $Module_8
##  [1] "LTN1"     "MIS18BP1" "THRAP3"   "USP47"    "OSBPL8"   "GOLIM4"  
##  [7] "NIPBL"    "WASL"     "WNK1"     "DYNC1H1"  "ATAD2"    "SYNCRIP" 
## [13] "EIF4G1"   "PRRC2C"   "ATRX"     "WASF2"    "GON4L"    "PNRC2"   
## [19] "FMNL2"    "NASP"     "SFSWAP"   "FXR1"     "PPP1R10"  "SRRM2"   
## [25] "SON"      "EHBP1"    "ILF3"     "ZBED6"    "DDX24"    "ATXN7L3B"
## [31] "SF3B1"    "UPF1"     "CCDC88A"  "LUZP1"    "SMC5"     "PPFIA1"  
## [37] "KMT2A"    "UBN2"     "TOP1"     "THOC2"   
## 
## $Module_9
##  [1] "PIGR"      "BPIFA2"    "KCNJ16"    "ODAM"      "SCGB3A1"   "LPO"      
##  [7] "MUC7"      "SLC13A5"   "SMR3B"     "DNASE2B"   "CST5"      "CXCL17"   
## [13] "SMR3A"     "PLEKHS1"   "FXYD2"     "PAX9"      "PRB4"      "BPIFB1"   
## [19] "DMBT1"     "C9orf152"  "PRB1"      "ELF5"      "TMEM213"   "CFTR"     
## [25] "ZG16B"     "FAM3D"     "TF"        "MYOC"      "SERPINA3"  "ETNPPL"   
## [31] "SCN7A"     "SOX10"     "CA6"       "FOLH1B"    "PRB3"      "GCNT3"    
## [37] "LINC01549" "NDRG2"     "C16orf89"  "LINC01482"
## 
## $Module_10
##  [1] "PRDX2"     "MZT2B"     "PPP6R2"    "ZC3H7B"    "MINDY1"    "XRCC2"    
##  [7] "ARHGEF1"   "TMEM106C"  "MEIOC"     "MTMR9"     "INTS4"     "HAUS2"    
## [13] "DIP2A"     "FAM161B"   "PODNL1"    "CCDC152"   "UNC45A"    "ZNF611"   
## [19] "UBQLN4"    "FBXW12"    "ENTPD6"    "SLF2"      "IWS1"      "LINC01692"
## [25] "NFKBID"    "ICAM4"     "PTAR1"     "C16orf92"  "WDR62"     "GVQW3"    
## [31] "EIF3C"     "BTBD18"    "FOXP1-IT1" "ARHGAP21"  "HCG18"     "GGA1"     
## [37] "GRM2"      "RAMP2-AS1" "TRIR"      "LINC01635"
## 
## $Module_11
##  [1] "LINC01777"  "OR6B1"      "IL9R"       "DKK4"       "LINC00896" 
##  [6] "LYZL4"      "LINC01364"  "DUSP21"     "PLA2G1B"    "CECR3"     
## [11] "MAB21L2"    "OR2H1"      "CST4"       "KYAT1"      "PCDHB1"    
## [16] "EDARADD"    "DEFA5"      "GPR78"      "KAAG1"      "SRARP"     
## [21] "DLGAP1-AS1" "OR5P2"      "SIM2"       "LINC02175"  "NXF5"      
## [26] "MAP3K11"    "OR1C1"      "HTR1B"      "TACR2"      "LINC01209" 
## [31] "POU6F2-AS2" "NKD1"       "ZNF486"     "OR10A4"     "SP5"       
## [36] "OLAH"       "TIMM17B"    "LINC00550"  "QPCTL"      "KIAA1328"  
## 
## $Module_12
##  [1] "CHCHD1"       "RPLP0"        "RPL6"         "PPA1"         "UBE2E1"      
##  [6] "RPL18"        "RPL10A"       "COX18"        "BLOC1S2"      "RPL11"       
## [11] "DPH5"         "RPL19"        "RPS18"        "RPS9"         "RPL8"        
## [16] "RPS16"        "RPS3"         "OXA1L"        "RPS14"        "EXOSC1"      
## [21] "RPS4X"        "CIAO2B"       "RPL30"        "RPL14"        "EPB41L4A-AS1"
## [26] "MAP1LC3B"     "RPL3"         "PABPC4"       "NOA1"         "RPL7"        
## [31] "EIF3A"        "NCOA4"        "RPL41"        "RPL35"        "LTA4H"       
## [36] "CCDC28A"      "IGBP1"        "RPL9"         "RPL36AL"      "PRDX4"       
## 
## $Module_13
##  [1] "MYH1"      "MYH2"      "CSRP3"     "MYBPC1"    "NRAP"      "MYL1"     
##  [7] "ANKRD1"    "HSFX3"     "TCAP"      "CKM"       "TTN"       "TNNC2"    
## [13] "ACTA1"     "MYBPC2"    "XIRP2"     "KLHL41"    "TNNC1"     "PYGM"     
## [19] "TNNT3"     "MYBPH"     "SYNPO2L"   "ANKRD31"   "PVALB"     "HSPB6"    
## [25] "COX6A2"    "LINC01352" "MYOT"      "CD22"      "XIRP1"     "ACTN2"    
## [31] "ARHGAP22"  "LINC00343" "VPREB3"    "CELA2B"    "AGR3"      "FAM47B"   
## [37] "TNNI2"     "CD19"      "CCDC120"   "ASB11"    
## 
## $Module_14
##  [1] "OR8D2"       "GP6"         "LINC00636"   "LINC02186"   "CCR9"       
##  [6] "CNTFR-AS1"   "SCTR"        "SLC6A2"      "ADAM7"       "AMELY"      
## [11] "TRHDE-AS1"   "SP8"         "HYAL4"       "SLC38A11"    "TSSK4"      
## [16] "LINC01180"   "PCDH15"      "LINC02458"   "LINC00862"   "MIR1915HG"  
## [21] "FGA"         "SLC30A8"     "ATP1A4"      "GNRHR"       "VN1R1"      
## [26] "MOBP"        "LINC00606"   "TMEM254-AS1" "MAGEE2"      "LINC02274"  
## [31] "CASC16"      "PRMT5-AS1"   "ZPLD1"       "TMEM171"     "CSTF3-DT"   
## [36] "FAM224A"     "OR2F2"       "C1orf100"    "TPT1-AS1"    "KIRREL3"
# 2. Generating adjacency matrix for hub genes from all modules
hub_genes<-unname(unlist(network_hub_list))
hubs_m<-TOM[hub_genes, hub_genes]
edges<-data.frame(from=rownames(hubs_m)[row(hubs_m)[upper.tri(hubs_m)]], 
           to=colnames(hubs_m)[col(hubs_m)[upper.tri(hubs_m)]], 
           value=hubs_m[upper.tri(hubs_m)])

# 3. Setting up thredshold for selecting leading edges in the network
leading_edges_ratio = 0.20 #top 1/5 edges based on edge weight.
threshold = quantile(edges$value, probs=1-leading_edges_ratio , na.rm = FALSE)
edges<-edges[edges$value >= threshold,]

# 4. Organizing vertices data frame and edges data frame for the network
vertices<-data.frame()
for(i in 1:length(network_hub_list)){
  module_name<-names(network_hub_list)[i]
  v<-data.frame(name = network_hub_list[[i]], module = rep(module_name, nTOM), size = rep(8, nTOM))
  vertices<-rbind(vertices, v)
}

vertices[,"group"]<-unlist(lapply(vertices$module, function(x) strsplit(x, "_")[[1]][2]))
vertices[,"id"]<-rownames(vertices)


# 5. Centrality analysis of hub genes co-expression network
g<-graph_from_data_frame(edges, directed = F, vertices)
eigenvector<-eigen_centrality(g)$vector
betweenness<-betweenness(g)
degree<-degree(g)
closeness<-closeness(g)
centralities <- cbind(degree, eigenvector, betweenness, closeness)
centrality_df<-cbind(centralities, vertices$module)

DT::datatable(centrality_df) #show centrality data of nodes for sorting
# 6. Visualize the hub genes co-expression network
edges[,"source"]<-vertices[match(edges$from, vertices$name), "id"]
edges[,"target"]<-vertices[match(edges$to, vertices$name), "id"]
edges$source<-as.numeric(edges$source)
edges$target<-as.numeric(edges$target)

edges<-data.frame(source = edges$source-1, target = edges$target-1, value = edges$value)
vertices<-cbind(vertices, centralities)
vertices[,"betweenness_sqrt"]<-sqrt(unlist(vertices$betweenness))

fn<-forceNetwork(Links = edges, Nodes = vertices,
                 Source = "source", Target = "target",
                 Nodesize = "betweenness_sqrt", fontSize = 12,
                 Value = "value", NodeID = "name",
                 Group = "group", opacity = 1,
                 legend = T, charge = -20, zoom = F,
                 opacityNoHover = 1,
                 linkWidth = JS("function(d) { return Math.cbrt(d.value)*0.4; }")
                 )
fn

Plotting the gene modules that closely co-expressed in MCC tumor samples

theme<-theme(panel.background = element_blank(),
             panel.border=element_rect(fill=NA),
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             strip.background=element_blank(),
             axis.text.x=element_text(colour="black"),
             axis.text.y=element_text(colour="black"),
             axis.ticks=element_line(colour="black"),
             plot.margin=unit(c(1,1,1,1),"line"),
             axis.title.x = element_text(color="black", size=18, face="bold"),
             axis.title.y = element_text(color="black", size=18, face="bold"),
             axis.text = element_text(size = 12, face = "bold"))

n = 25
em_m3_BP<-enrichGO(gene_df$Module_3$hgnc_symbol, 
                     OrgDb = org.Hs.eg.db, 
                     ont = "BP",
                     pAdjustMethod = "BH", 
                     pvalueCutoff = 0.05,
                     qvalueCutoff = 0.25, 
                     keyType = 'SYMBOL')
m3_BP<-em_m3_BP@result[order(em_m3_BP@result$Count, decreasing = T),][1:n,]
pm3<-ggplot(m3_BP, aes(x=Count, y=factor(Description, levels = rev(c(m3_BP$Description))), size = Count, color = p.adjust))+
      geom_point() +
      ylab("Enriched GO Terms in module 3")+
      scale_colour_gradient(low = "#F16913", high = "#FDD0A2") +
      scale_fill_gradient(low = "#F16913", high = "#FDD0A2") +
      scale_y_discrete(labels = function(x) str_wrap(x, width = 40))+
      theme_set(theme_bw() +
      theme(legend.position = "right"))+
      theme

em_m5_BP<-enrichGO(gene_df$Module_5$hgnc_symbol, 
                     OrgDb = org.Hs.eg.db, 
                     ont = "BP",
                     pAdjustMethod = "BH", 
                     pvalueCutoff = 0.05,
                     qvalueCutoff = 0.25, 
                     keyType = 'SYMBOL')
m5_BP<-em_m5_BP@result[order(em_m5_BP@result$Count, decreasing = T),][1:n,]
pm5<-ggplot(m5_BP, aes(x=Count, y=factor(Description, levels = rev(c(m5_BP$Description))), size = Count, color = p.adjust))+
      geom_point() +
      ylab("Enriched GO Terms in module 5")+
      scale_colour_gradient(low = "#238B45", high = "#A1D99B") +
      scale_fill_gradient(low = "#238B45", high = "#A1D99B") +
      scale_y_discrete(labels = function(x) str_wrap(x, width = 40))+
      theme_set(theme_bw() +
      theme(legend.position = "right"))+
      theme

em_m6_BP<-enrichGO(gene_df$Module_6$hgnc_symbol, 
                     OrgDb = org.Hs.eg.db, 
                     ont = "BP",
                     pAdjustMethod = "BH", 
                     pvalueCutoff = 0.05,
                     qvalueCutoff = 0.25, 
                     keyType = 'SYMBOL')
m6_BP<-em_m6_BP@result[order(em_m6_BP@result$Count, decreasing = T),][1:n,]
pm6<-ggplot(m6_BP, aes(x=Count, y=factor(Description, levels = rev(c(m6_BP$Description))), size = Count, color = p.adjust))+
      geom_point() +
      ylab("Enriched GO Terms in module 6")+
      scale_colour_gradient(low = "#74C476", high = "#E5F5E0") +
      scale_fill_gradient(low = "#74C476", high = "#E5F5E0") +
      scale_y_discrete(labels = function(x) str_wrap(x, width = 40))+
      theme_set(theme_bw() +
      theme(legend.position = "right"))+
      theme

WGCNA_module_GO_terms_plot<-ggarrange(pm3, pm5, pm6,
              ncol = 3,
              nrow = 1
) 
WGCNA_module_GO_terms_plot

IMR90 dynamic analysis of GOIs

Importing and organizing normalized IMR90 gene expression data

IMR90_edat<-readRDS("/xdisk/mpadi/jiawenyang/data/merkel_cell/pp_in_mcc_paper/imr90_wgcna/IMR90_edat.RDS")

Taking mean expression value per condition per time point

IMR90_ER<-IMR90_edat[, grep("ER", colnames(IMR90_edat))]
IMR90_GFP<-IMR90_edat[, grep("GFP", colnames(IMR90_edat))]

ER_time_point<-c("ER_0", "ER_4", "ER_8", 
                 "ER_12", "ER_16", "ER_20", 
                 "ER_24", "ER_32", "ER_40", 
                 "ER_48")
GFP_time_point<-c("GFP_0", "GFP_4", "GFP_8", 
                  "GFP_12", "GFP_16", "GFP_20", 
                  "GFP_24", "GFP_32", "GFP_40", 
                  "GFP_48")

IMR90_ER_mean<-data.frame(genes = rownames(IMR90_ER))
for (i in 1:length(ER_time_point)){
  time_point<-ER_time_point[i]
  sub_df<-as.data.frame(IMR90_ER[, grep(time_point, colnames(IMR90_ER))])
  sub_df[,"mean"]<-apply(sub_df, 1, mean)
  df_mean<-as.data.frame(sub_df[, "mean"])
  colnames(df_mean)<-time_point
  IMR90_ER_mean<-cbind(IMR90_ER_mean, df_mean)
}

IMR90_GFP_mean<-data.frame(genes = rownames(IMR90_GFP))
for (i in 1:length(GFP_time_point)){
  time_point<-GFP_time_point[i]
  sub_df<-as.data.frame(IMR90_GFP[, grep(time_point, colnames(IMR90_GFP))])
  sub_df[,"mean"]<-apply(sub_df, 1, mean)
  df_mean<-as.data.frame(sub_df[, "mean"])
  colnames(df_mean)<-time_point
  IMR90_GFP_mean<-cbind(IMR90_GFP_mean, df_mean)
}

IMR90_ER_mean_matrix<-as.matrix(IMR90_ER_mean[, 2:11])
IMR90_GFP_mean_matrix<-as.matrix(IMR90_GFP_mean[, 2:11])

IMR90_ER_GFP_DELTA<-IMR90_ER_mean_matrix-IMR90_GFP_mean_matrix

IMR90_ER_GFP_DELTA<-as.data.frame(IMR90_ER_GFP_DELTA)
rownames(IMR90_ER_GFP_DELTA)<-IMR90_ER_mean$genes

Adjusting gene expression value of ER samples by substracting the value of GFP samples at the same time point

IMR90_ER_GFP_DELTA_df<-reshape2::melt(as.matrix(IMR90_ER_GFP_DELTA))
colnames(IMR90_ER_GFP_DELTA_df)<-c("genes", "time", "ER-GFP")
IMR90_ER_GFP_DELTA_df$time<-unlist(lapply(as.character(IMR90_ER_GFP_DELTA_df$time), function(x) strsplit(x, "_")[[1]][2]))
IMR90_ER_GFP_DELTA_df$time<-as.numeric(IMR90_ER_GFP_DELTA_df$time)
IMR90_ER_GFP_DELTA_df$`ER-GFP`<-as.numeric(IMR90_ER_GFP_DELTA_df$`ER-GFP`)

Performing DEGs analysis for ER vs. GFP at 48 hours

group_48h_DEGs<-factor(c(rep("GFP_48",3), rep("ER_48",3)), levels = c("GFP_48", "ER_48"))
design<-model.matrix(~0 + group_48h_DEGs)
IMR90_48h_ER_GFP<-IMR90_edat[,c(grep("GFP_48",colnames(IMR90_edat)), grep("ER_48", colnames(IMR90_edat)))]
comparison = "group_48h_DEGsER_48 - group_48h_DEGsGFP_48"
cont=makeContrasts(comparison,levels=design)
fit=lmFit(IMR90_48h_ER_GFP,design)
vfit=contrasts.fit(fit,contrasts=cont)
efit<- eBayes(vfit)
DEGs_48h<-topTreat(efit, sort.by = "P", n = Inf)

Plotting the gene markers for MCC and neuroendocrine tumors in ER 48 hours samples

# gene markers
neuroendocrine_markers<-c("ENO2", "NEFM", "NEFH", "NMB", "HES6")
neural_genes<-c("EFNB1","SEMA4D", "SLIT2")
keratin_markers<-c("KRT8", "KRT18")
wnt_down_gene_list<-c("WNT5A","WNT5B", "FZD2", "FZD7","FZD8", "WNT16", "TCF7L2")
wnt_up_gene_list<-c("WNT3", "TCF7","TCF3","WNT9A","FZD9", "LEF1")

# graph settings
theme<-theme(panel.background = element_blank(),
             panel.border=element_rect(fill=NA),
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             strip.background=element_blank(),
             axis.text.x=element_text(colour="black"),
             axis.text.y=element_text(colour="black"),
             axis.ticks=element_line(colour="black"),
             plot.margin=unit(c(1,1,1,1),"line"),
             legend.title = element_text(face = "bold"),
             legend.text = element_text(face = "bold"),
             axis.title.x = element_text(color="black", size=18, face="bold"),
             axis.title.y = element_text(color="black", size=18, face="bold"),
             axis.text = element_text(size = 12, face = "bold"))
FC_df<-rbind(data.frame(genes =neuroendocrine_markers,logFC = DEGs_48h[neuroendocrine_markers,"logFC"],  makers = rep("neuroendocrine_markers", n = length(neuroendocrine_markers))),
             data.frame(genes =neural_genes, logFC = DEGs_48h[neural_genes,"logFC"], makers = rep("neural_genes", n = length(neural_genes))),
             data.frame(genes =keratin_markers, logFC = DEGs_48h[keratin_markers, "logFC"], makers = rep("keratin_markers", n = length(keratin_markers))))

# Bar plot to show the FC of ER vs.GFP at 48 hrs 
pdf(file = "/xdisk/mpadi/jiawenyang/result/pp_in_merk")
p<-ggplot(FC_df, aes(x=factor(genes, levels = genes), y=2^(logFC), fill=factor(makers, levels = c("neuroendocrine_markers", "neural_genes", "keratin_markers")))) +
   geom_bar(stat="identity")+
   guides(fill=guide_legend(title="Gene categories")) +
   xlab("")+
   ylab("Fold Change ER vs.GFP at 48 hrs") +
   geom_hline(yintercept = 1, color = "red", linetype = "twodash") +
   theme(legend.position="bottom")+
   theme
p

# Display the statistic data from DEGs anaylsis for 48 hrs ER and GFP samples 
DT::datatable(DEGs_48h[c(neuroendocrine_markers, neural_genes, keratin_markers),])

Plotting the MCC marker genes expression dynamic pattern in ER samples (adjusted by GFP)

# dynamic expression pattern of MCC markers 
NE_markers<-c("SEMA4D","NEFM","NEFH","SLIT2","HES6","EFNB1","ENO2","NMB","KRT8","KRT18")
NE_markers1<-c("SEMA4D", "NEFM", "NEFH", "SLIT2","HES6")
NE_markers2<-c("EFNB1", "ENO2","NMB","KRT8", "KRT18")

target_genes_df<-IMR90_edat[NE_markers,]
target_genes_df<-na.omit(target_genes_df)

df_target<-matrix(nrow = 110, ncol = 3)
df_target_sum<-c()
for (i in 1: length(rownames(target_genes_df))) {
  k<-rownames(target_genes_df)[i]
  df_target[,1]<-rep(k, 110)
  df_target[,2]<-as.character(target_genes_df[k,])
  df_target[,3]<-colnames(target_genes_df)
  df_target_sum<-rbind(df_target_sum, df_target)
}

#df_target_sum
time<-unlist(lapply(df_target_sum[,3], function(x) strsplit(x,"_")[[1]][2]))
repeats<-unlist(lapply(df_target_sum[,3], function(x) strsplit(x,"_")[[1]][3]))
df_target_sum<-Reduce(cbind, list(df_target_sum, time, repeats))
colnames(df_target_sum)<-c("genes","value","group","time","repeats")
df_target_sum<-as.data.frame(df_target_sum)
df_target_sum$value<-as.character(df_target_sum$value)
df_target_sum$time<-as.character(df_target_sum$time)

#Seperate data into groups
df_target_GFP<-df_target_sum[grep("GFP", df_target_sum[, "group"]),]
df_target_ER<-df_target_sum[grep("ER", df_target_sum[, "group"]),]

#GFP
df_target_GFP_plot<-data.frame()
df_target_GFP_plot_sum<-data.frame()
for (i in 1:length(unique(df_target_GFP[,"genes"]))){
  k<-unique(df_target_GFP[,"genes"])[i]
  for (j in unique(df_target_GFP$time)){
    df<-df_target_GFP %>% filter(genes == k, time == j)
    df_target_GFP_plot<-data.frame(genes = k, time = j, mean = mean(as.numeric(df$value)))
    df_target_GFP_plot_sum<-rbind(df_target_GFP_plot_sum, df_target_GFP_plot)
  }
}   

#ER
df_target_ER_plot<-data.frame()
df_target_ER_plot_sum<-data.frame()
for (i in 1:length(unique(df_target_ER$genes))){
  k<-unique(df_target_ER$genes)[i]
  for (j in unique(df_target_ER$time)){
    df<-df_target_ER %>% filter(genes == k, time == j)
    df_target_ER_plot<-data.frame(genes = k, time = j, mean = mean(as.numeric(df$value)))
    df_target_ER_plot_sum<-rbind(df_target_ER_plot_sum, df_target_ER_plot)
  }
}   

#foldchange to GFP at the same time point
df_target_ER_plot_sum$"foldchange"<-2^(as.numeric(df_target_ER_plot_sum$mean)-as.numeric(df_target_GFP_plot_sum$mean))
#log2FC to GFP at the same time point
df_target_ER_plot_sum$"log2_expression_value"<-as.numeric(df_target_ER_plot_sum$mean)-as.numeric(df_target_GFP_plot_sum$mean)


df_er_neuroendocrine_marker_up<-df_target_ER_plot_sum[which(df_target_ER_plot_sum$genes %in% NE_markers1),]
er_neuromarker_up <- ggplot(df_er_neuroendocrine_marker_up, aes(x=factor(time, levels = c(0,4,8,12,16,20,24,32,40,48)),y=log2_expression_value,colour=genes,group=genes,fill=genes)) +
  geom_line(size =1.2)+
  geom_point(size=1.8)+
  theme_classic()+
  scale_linetype_manual(values=c("twodash"))+
  geom_dl(aes(label = genes), method = list(dl.trans(x = x + 0.10), "last.points", cex = 1))+
  theme(text=element_text(size=25, face = "bold"))+
  ylim(0,4)+
  labs(x = 'time', y = "log2FC to GFP at the same time point",title = 'Neuroendocrine tumors marker genes expression fold changes \n to GFP condition in IMR90 cell with oncogenic ER antigen')
er_neuromarker_up

df_er_neuroendocrine_marker_flat<-df_target_ER_plot_sum[which(df_target_ER_plot_sum$genes %in% NE_markers2),]
er_neuromarker_flat <- ggplot(df_er_neuroendocrine_marker_flat, aes(x=factor(time, levels = c(0,4,8,12,16,20,24,32,40,48)),y=log2_expression_value,colour=genes,group=genes,fill=genes)) +
  geom_line(size =1.2)+
  geom_point(size=1.8)+
  theme_classic()+
  scale_linetype_manual(values=c("twodash"))+
  geom_dl(aes(label = genes), method = list(dl.trans(x = x + 0.10), "last.points", cex = 1))+
  theme(text=element_text(size=25, face = "bold"))+
  ylim(0,4)+
  labs(x = 'time', y = "log2FC to GFP at the same time point",title = 'Neuroendocrine tumors marker genes expression fold changes \n to GFP condition in IMR90 cell with oncogenic ER antigen')
er_neuromarker_flat

Plotting the WNT genes expression dynamic pattern in ER samples (adjusted by GFP)

# dynamic expression pattern of GOIs 
p_up<-ggplot(IMR90_ER_GFP_DELTA_df[IMR90_ER_GFP_DELTA_df$genes %in% wnt_up_gene_list, ], 
       aes(x=factor(time, levels = c(0,4,8,12,16,20,24,32,40,48)),
           y=`ER-GFP`, fill = genes, colour = genes, group = genes)) +
  geom_line(size =0.5)+
  geom_point(size=1.5)+
  geom_dl(aes(label = genes), method = list(dl.trans(x = x + 0.15), "last.points", cex = 0.7, face = "bold"))+
  ylim(-3, 2)+
  labs(x = 'time', y = "Relative gene expression level to control at the same time point")+
  theme(plot.title = element_text(size = 12, face = "bold"),
    legend.title=element_text(size=15), 
    legend.text=element_text(size=12))
p_up<-p_up+theme

p_down<-ggplot(IMR90_ER_GFP_DELTA_df[IMR90_ER_GFP_DELTA_df$genes %in% wnt_down_gene_list, ], 
       aes(x=factor(time, levels = c(0,4,8,12,16,20,24,32,40,48)),
           y=`ER-GFP`, fill = genes, colour = genes, group = genes)) +
  geom_line(size =0.5)+
  geom_point(size=1.5)+
  geom_dl(aes(label = genes), method = list(dl.trans(x = x + 0.15), "last.points", cex = 0.7, face = "bold"))+
  ylim(-3, 2)+
  labs(x = 'time', y = "Relative gene expression level to control at the same time point")+
  theme(plot.title = element_text(size = 12, face = "bold"),
    legend.title=element_text(size=15), 
    legend.text=element_text(size=12))
p_down<-p_down+theme

IMR90_WNT_genes_plot<-ggarrange(p_up, p_down,
              ncol = 2,
              nrow = 1
) 
IMR90_WNT_genes_plot

The dynamic pattern of gene-gene correlation in ER and GFP samples

Performing pearson correlation on GOIs in different conditions

pc.test<-function(pc, ...){
  pc<-as.matrix(pc)
  n<-ncol(pc)
  p.pc<-matrix(NA, n, n)
  diag(p.pc)<-0
  for (i in 1:(n-1)){
    for (j in (i + 1):n){
      tmp <- cor.test(pc[,i], pc[,j], ...)
      p.pc[i,j]<-p.pc[j,i]<-tmp$p.value
    }
  }
  colnames(p.pc) <- rownames(p.pc) <-colnames(pc)
  p.pc
}

pearson_cor_matrix<-function(condition, time_period){
  sample<-paste0(condition,"_",time_period)
  matrix<-edat[[sample]]
  result<-list()
  pc<-cor(as.matrix(t(matrix)), method = "pearson")
  sig<-pc.test(pc)
  result[["pc"]]<-pc
  result[["pc.pvalue"]]<-sig
  col<- colorRampPalette(c("blue", "white", "red"))(20)
  p_cor<-corrplot(pc, type="lower", method = "circle", order = "original", col = col,
           p.mat = sig, 
           sig.level = 0.05, 
           insig = "pch",
           pch.cex = 1.5,
           tl.col = "black",
           tl.cex = 1.5,
           number.cex = 7/ncol(pc),
           cl.cex = nrow(pc)*10/400)
  return(p_cor)
}
wnt_genes<-c("WNT3","WNT5B", "WNT9A", "FZD2", "FZD8", "FZD9", "LRP5", "TCF7", "TCF7L1", "WNT5A", "WNT16","FZD7", "FZD6", "FZD4", "FZD1", "LRP6","TCF7L2") #TCF3 shows similar pattern to TCF7L1, TCF4 shows similar pattern to TCF7L2
hippo_genes<-c("YAP1", "WWTR1", "LATS1", "LATS2")

IMR90_edat_MCC_wnt<-IMR90_edat[c(wnt_genes, hippo_genes, neuroendocrine_markers, neural_genes, keratin_markers), ]
IMR90_edat_MCC_wnt<-na.omit(IMR90_edat_MCC_wnt)

colnames(IMR90_edat_MCC_wnt)<-unlist(lapply(colnames(IMR90_edat_MCC_wnt), function(x) substring(x, 1, nchar(x)-2)))
colnames(IMR90_edat_MCC_wnt)<-gsub("_48", "_t5",colnames(IMR90_edat_MCC_wnt))
colnames(IMR90_edat_MCC_wnt)<-gsub("_40", "_t5",colnames(IMR90_edat_MCC_wnt))
colnames(IMR90_edat_MCC_wnt)<-gsub("_32", "_t4",colnames(IMR90_edat_MCC_wnt))
colnames(IMR90_edat_MCC_wnt)<-gsub("_24", "_t4",colnames(IMR90_edat_MCC_wnt))
colnames(IMR90_edat_MCC_wnt)<-gsub("_20", "_t3",colnames(IMR90_edat_MCC_wnt))
colnames(IMR90_edat_MCC_wnt)<-gsub("_16", "_t3",colnames(IMR90_edat_MCC_wnt))
colnames(IMR90_edat_MCC_wnt)<-gsub("_12", "_t2",colnames(IMR90_edat_MCC_wnt))
colnames(IMR90_edat_MCC_wnt)<-gsub("_8", "_t2",colnames(IMR90_edat_MCC_wnt))
colnames(IMR90_edat_MCC_wnt)<-gsub("_4", "_t1",colnames(IMR90_edat_MCC_wnt))
colnames(IMR90_edat_MCC_wnt)<-gsub("_0", "_t1",colnames(IMR90_edat_MCC_wnt))

conditions<-c("ER","GFP")
time_points<-c("t1", "t2","t3","t4","t5")

edat<-list()
for (i in 1:length(time_points)){
  time<-paste0("_",time_points[i])
  for (j in 1:length(conditions)){
    condition<-conditions[j]
    samples<-paste0(condition, time)
    df<-IMR90_edat_MCC_wnt[,which(colnames(IMR90_edat_MCC_wnt) == samples)]
    rownames(df)[which(rownames(df) %in% wnt_genes)]<-paste0(rownames(df)[which(rownames(df) %in% wnt_genes)], "_wnt")
    rownames(df)[which(rownames(df) %in% neural_genes)]<-paste0(rownames(df)[which(rownames(df) %in% neural_genes)], "_neural")
    rownames(df)[which(rownames(df) %in% hippo_genes)]<-paste0(rownames(df)[which(rownames(df) %in% hippo_genes)], "_hippo")
    rownames(df)[which(rownames(df) %in% keratin_markers)]<-paste0(rownames(df)[which(rownames(df) %in% keratin_markers)], "_keratin")
    rownames(df)[which(rownames(df) %in% neuroendocrine_markers)]<-paste0(rownames(df)[which(rownames(df) %in% neuroendocrine_markers)], "_NEmarker")
    edat[[samples]]<-df
  }
}

plotting correlation results of ER samples (left to right: t1, t2, t3, t4, t5)

par(mfrow = c(1,5))
condition = "ER"
  for (j in 1:length(time_points)){
    time_period<-time_points[j]
    sample_name<-paste0(condition, "_",time_period)
    p_cor<-pearson_cor_matrix(condition, time_period)
    assign(paste0(condition,"_",time_period), p_cor)
  }

plotting correlation results of GFP samples (left to right: t1, t2, t3, t4, t5)

par(mfrow = c(1,5))
condition = "GFP"
  for (j in 1:length(time_points)){
    time_period<-time_points[j]
    sample_name<-paste0(condition, "_",time_period)
    p_cor<-pearson_cor_matrix(condition, time_period)
    assign(paste0(condition,"_",time_period), p_cor)
  }

back to top

back to home